Total coverage: 436721 (25%)of 1765781
15 15 5 10 10 31 30 30 30 30 103 102 99 20 99 17 86 31 75 75 73 75 74 5 75 67 75 68 14 75 66 18 2 17 75 74 13 5 8 63 50 17 75 72 17 75 75 103 102 31 3 3 3 3 2 3 3 3 2 2 2 1 1 1 21 29 29 4 57 25 56 44 21 16 16 16 45 44 21 45 44 57 19 19 19 1 19 10 10 1 19 11 19 19 19 19 11 11 11 11 6 5 4 4 11 1 11 3 18 1 1 8 8 8 3 3 3 3 3 8 60 26 60 58 40 29 13 60 5 60 1 58 1 59 46 59 59 59 59 47 1 60 52 59 60 59 21 60 57 57 18 4 56 20 57 58 3 3 8 7 3 3 3 3 1 1 1 3 3 3 1 3 9 8 1 4 4 3 4 3 3 3 5 5 4 3 3 71 70 46 32 32 28 5 32 59 58 21 71 66 5 6 12 12 12 6 12 12 12 7 5 12 54 54 53 54 71 1 1 70 70 70 70 53 69 70 70 67 70 66 5 5 53 6 59 53 13 53 52 53 53 41 53 8 6 6 6 14 14 14 14 14 6 14 14 14 8 13 14 13 1 14 14 6 14 8 6 6 6 3 1 3 2 83 83 83 14 14 14 14 6 8 6 82 82 65 83 71 27 70 54 27 23 3 2 71 5 1 1 5 66 66 5 76 49 75 59 59 59 59 6 59 5 66 47 66 5 5 5 5 63 82 1 65 6 65 10 10 4 63 10 83 1 82 83 81 76 6 71 76 44 44 83 82 65 65 1 64 2 63 59 59 59 1 59 1 58 23 24 3 3 21 7 7 22 24 18 23 3 9 1 7 15 13 13 11 5 2 7 7 7 7 19 20 20 19 20 15 15 20 24 9 1 23 17 23 22 23 22 21 21 20 20 4 3 20 13 10 15 16 17 17 17 12 7 6 6 7 17 17 20 17 17 17 17 13 10 48 41 6 4 3 3 4 4 50 50 48 45 1 44 24 20 25 33 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 // SPDX-License-Identifier: GPL-2.0 /* * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * * Page migration was first developed in the context of the memory hotplug * project. The main authors of the migration code are: * * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> * Hirokazu Takahashi <taka@valinux.co.jp> * Dave Hansen <haveblue@us.ibm.com> * Christoph Lameter */ #include <linux/migrate.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/mm_inline.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/writeback.h> #include <linux/mempolicy.h> #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/backing-dev.h> #include <linux/compaction.h> #include <linux/syscalls.h> #include <linux/compat.h> #include <linux/hugetlb.h> #include <linux/gfp.h> #include <linux/pfn_t.h> #include <linux/page_idle.h> #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> #include <linux/memory.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include <trace/events/migrate.h> #include "internal.h" bool isolate_movable_page(struct page *page, isolate_mode_t mode) { struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; /* * Avoid burning cycles with pages that are yet under __free_pages(), * or just got freed under us. * * In case we 'win' a race for a movable page being freed under us and * raise its refcount preventing __free_pages() from doing its job * the put_page() at the end of this block will take care of * release this page, thus avoiding a nasty leakage. */ if (!folio) goto out; if (unlikely(folio_test_slab(folio))) goto out_putfolio; /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ smp_rmb(); /* * Check movable flag before taking the page lock because * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__folio_test_movable(folio))) goto out_putfolio; /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ smp_rmb(); if (unlikely(folio_test_slab(folio))) goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent * compaction threads can race against page migration functions * as well as race against the releasing a page. * * In order to avoid having an already isolated movable page * being (wrongly) re-isolated while it is under migration, * or to avoid attempting to isolate pages being released, * lets be sure we have the page lock * before proceeding with the movable page isolation steps. */ if (unlikely(!folio_trylock(folio))) goto out_putfolio; if (!folio_test_movable(folio) || folio_test_isolated(folio)) goto out_no_isolated; mops = folio_movable_ops(folio); VM_BUG_ON_FOLIO(!mops, folio); if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; /* Driver shouldn't use the isolated flag */ WARN_ON_ONCE(folio_test_isolated(folio)); folio_set_isolated(folio); folio_unlock(folio); return true; out_no_isolated: folio_unlock(folio); out_putfolio: folio_put(folio); out: return false; } static void putback_movable_folio(struct folio *folio) { const struct movable_operations *mops = folio_movable_ops(folio); mops->putback_page(&folio->page); folio_clear_isolated(folio); } /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() * and isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { struct folio *folio; struct folio *folio2; list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { folio_putback_active_hugetlb(folio); continue; } list_del(&folio->lru); /* * We isolated non-lru movable folio so here we can use * __folio_test_movable because LRU folio's mapping cannot * have PAGE_MAPPING_MOVABLE. */ if (unlikely(__folio_test_movable(folio))) { VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); folio_lock(folio); if (folio_test_movable(folio)) putback_movable_folio(folio); else folio_clear_isolated(folio); folio_unlock(folio); folio_put(folio); } else { node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), -folio_nr_pages(folio)); folio_putback_lru(folio); } } } /* Must be called with an elevated refcount on the non-hugetlb folio */ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) { bool isolated, lru; if (folio_test_hugetlb(folio)) return isolate_hugetlb(folio, list); lru = !__folio_test_movable(folio); if (lru) isolated = folio_isolate_lru(folio); else isolated = isolate_movable_page(&folio->page, ISOLATE_UNEVICTABLE); if (!isolated) return false; list_add(&folio->lru, list); if (lru) node_stat_add_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio)); return true; } static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, struct folio *folio, unsigned long idx) { struct page *page = folio_page(folio, idx); bool contains_data; pte_t newpte; void *addr; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) || mm_forbids_zeropage(pvmw->vma->vm_mm)) return false; /* * The pmd entry mapping the old thp was flushed and the pte mapping * this subpage has been non present. If the subpage is only zero-filled * then map it to the shared zeropage. */ addr = kmap_local_page(page); contains_data = memchr_inv(addr, 0, PAGE_SIZE); kunmap_local(addr); if (contains_data) return false; newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address), pvmw->vma->vm_page_prot)); set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte); dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio)); return true; } struct rmap_walk_arg { struct folio *folio; bool map_unused_to_zeropage; }; /* * Restore a potential migration pte to a working pte entry */ static bool remove_migration_pte(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg) { struct rmap_walk_arg *rmap_walk_arg = arg; DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION); while (page_vma_mapped_walk(&pvmw)) { rmap_t rmap_flags = RMAP_NONE; pte_t old_pte; pte_t pte; swp_entry_t entry; struct page *new; unsigned long idx = 0; /* pgoff is invalid for ksm pages, but they are never large */ if (folio_test_large(folio) && !folio_test_hugetlb(folio)) idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; new = folio_page(folio, idx); #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION /* PMD-mapped THP migration entry */ if (!pvmw.pte) { VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); remove_migration_pmd(&pvmw, new); continue; } #endif if (rmap_walk_arg->map_unused_to_zeropage && try_to_map_unused_to_zeropage(&pvmw, folio, idx)) continue; folio_get(folio); pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); old_pte = ptep_get(pvmw.pte); entry = pte_to_swp_entry(old_pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); if (pte_swp_soft_dirty(old_pte)) pte = pte_mksoft_dirty(pte); else pte = pte_clear_soft_dirty(pte); if (is_writable_migration_entry(entry)) pte = pte_mkwrite(pte, vma); else if (pte_swp_uffd_wp(old_pte)) pte = pte_mkuffd_wp(pte); if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; if (unlikely(is_device_private_page(new))) { if (pte_write(pte)) entry = make_writable_device_private_entry( page_to_pfn(new)); else entry = make_readable_device_private_entry( page_to_pfn(new)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(old_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(old_pte)) pte = pte_swp_mkuffd_wp(pte); } #ifdef CONFIG_HUGETLB_PAGE if (folio_test_hugetlb(folio)) { struct hstate *h = hstate_vma(vma); unsigned int shift = huge_page_shift(h); unsigned long psize = huge_page_size(h); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) hugetlb_add_anon_rmap(folio, vma, pvmw.address, rmap_flags); else hugetlb_add_file_rmap(folio); set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, psize); } else #endif { if (folio_test_anon(folio)) folio_add_anon_rmap_pte(folio, new, vma, pvmw.address, rmap_flags); else folio_add_file_rmap_pte(folio, new, vma); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), compound_order(new)); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } return true; } /* * Get rid of all migration entries and replace them by * references to the indicated page. */ void remove_migration_ptes(struct folio *src, struct folio *dst, int flags) { struct rmap_walk_arg rmap_walk_arg = { .folio = src, .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE, }; struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = &rmap_walk_arg, }; VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src); if (flags & RMP_LOCKED) rmap_walk_locked(dst, &rwc); else rmap_walk(dst, &rwc); } /* * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pte_t *ptep; pte_t pte; swp_entry_t entry; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return; pte = ptep_get(ptep); pte_unmap(ptep); if (!is_swap_pte(pte)) goto out; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) goto out; migration_entry_wait_on_locked(entry, ptl); return; out: spin_unlock(ptl); } #ifdef CONFIG_HUGETLB_PAGE /* * The vma read lock must be held upon entry. Holding that lock prevents either * the pte or the ptl from being freed. * * This function will release the vma lock before returning. */ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); pte_t pte; hugetlb_vma_assert_locked(vma); spin_lock(ptl); pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (unlikely(!is_hugetlb_entry_migration(pte))) { spin_unlock(ptl); hugetlb_vma_unlock_read(vma); } else { /* * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the * pgtable lock released. See comment right above pgtable * lock release in migration_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); } } #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); return; unlock: spin_unlock(ptl); } #endif static int folio_expected_refs(struct address_space *mapping, struct folio *folio) { int refs = 1; if (!mapping) return refs; refs += folio_nr_pages(folio); if (folio_test_private(folio)) refs++; return refs; } /* * Replace the folio in the mapping. * * The number of remaining references must be: * 1 for anonymous folios without a mapping * 2 for folios with a mapping * 3 for folios with a mapping and PagePrivate/PagePrivate2 set. */ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); long entries, i; if (!mapping) { /* Take off deferred split queue while frozen and memcg set */ if (folio_test_large(folio) && folio_test_large_rmappable(folio)) { if (!folio_ref_freeze(folio, expected_count)) return -EAGAIN; folio_undo_large_rmappable(folio); folio_ref_unfreeze(folio, expected_count); } /* No turning back from here */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); return MIGRATEPAGE_SUCCESS; } oldzone = folio_zone(folio); newzone = folio_zone(newfolio); xas_lock_irq(&xas); if (!folio_ref_freeze(folio, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } /* Take off deferred split queue while frozen and memcg set */ folio_undo_large_rmappable(folio); /* * Now we know that no one else is looking at the folio: * no turning back from here. */ newfolio->index = folio->index; newfolio->mapping = folio->mapping; if (folio_test_anon(folio) && folio_test_large(folio)) mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); folio_ref_add(newfolio, nr); /* add cache reference */ if (folio_test_swapbacked(folio)) { __folio_set_swapbacked(newfolio); if (folio_test_swapcache(folio)) { folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); } entries = nr; } else { VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); entries = 1; } /* Move dirty while folio refs frozen and newfolio not yet exposed */ dirty = folio_test_dirty(folio); if (dirty) { folio_clear_dirty(folio); folio_set_dirty(newfolio); } /* Swap cache still stores N entries instead of a high-order entry */ for (i = 0; i < entries; i++) { xas_store(&xas, newfolio); xas_next(&xas); } /* * Drop cache reference from old folio by unfreezing * to one less reference. * We know this isn't the last reference. */ folio_ref_unfreeze(folio, expected_count - nr); xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* * If moved to a different zone then also account * the folio for that zone. Other VM counters will be * taken care of when we establish references to the * new folio and drop references to the old folio. * * Note that anonymous folios are accounted for * via NR_FILE_PAGES and NR_ANON_MAPPED if they * are mapped to swap space. */ if (newzone != oldzone) { struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; memcg = folio_memcg(folio); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); if (folio_test_pmd_mappable(folio)) { __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); } } #ifdef CONFIG_SWAP if (folio_test_swapcache(folio)) { __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); } #endif if (dirty && mapping_can_writeback(mapping)) { __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } local_irq_enable(); return MIGRATEPAGE_SUCCESS; } int folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int extra_count) { int expected_count = folio_expected_refs(mapping, folio) + extra_count; if (folio_ref_count(folio) != expected_count) return -EAGAIN; return __folio_migrate_mapping(mapping, newfolio, folio, expected_count); } EXPORT_SYMBOL(folio_migrate_mapping); /* * The expected number of remaining references is the same as that * of folio_migrate_mapping(). */ int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { XA_STATE(xas, &mapping->i_pages, folio_index(src)); int rc, expected_count = folio_expected_refs(mapping, src); if (folio_ref_count(src) != expected_count) return -EAGAIN; rc = folio_mc_copy(dst, src); if (unlikely(rc)) return rc; xas_lock_irq(&xas); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } dst->index = src->index; dst->mapping = src->mapping; folio_ref_add(dst, folio_nr_pages(dst)); xas_store(&xas, dst); folio_ref_unfreeze(src, expected_count - folio_nr_pages(src)); xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } /* * Copy the flags and some other ancillary information */ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) { int cpupid; if (folio_test_referenced(folio)) folio_set_referenced(newfolio); if (folio_test_uptodate(folio)) folio_mark_uptodate(newfolio); if (folio_test_clear_active(folio)) { VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); folio_set_active(newfolio); } else if (folio_test_clear_unevictable(folio)) folio_set_unevictable(newfolio); if (folio_test_workingset(folio)) folio_set_workingset(newfolio); if (folio_test_checked(folio)) folio_set_checked(newfolio); /* * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via * migration entries. We can still have PG_anon_exclusive set on an * effectively unmapped and unreferenced first sub-pages of an * anonymous THP: we can simply copy it here via PG_mappedtodisk. */ if (folio_test_mappedtodisk(folio)) folio_set_mappedtodisk(newfolio); /* Move dirty on pages not done by folio_migrate_mapping() */ if (folio_test_dirty(folio)) folio_set_dirty(newfolio); if (folio_test_young(folio)) folio_set_young(newfolio); if (folio_test_idle(folio)) folio_set_idle(newfolio); /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. */ cpupid = folio_xchg_last_cpupid(folio, -1); /* * For memory tiering mode, when migrate between slow and fast * memory node, reset cpupid, because that is used to record * page access time in slow memory node. */ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { bool f_toptier = node_is_toptier(folio_nid(folio)); bool t_toptier = node_is_toptier(folio_nid(newfolio)); if (f_toptier != t_toptier) cpupid = -1; } folio_xchg_last_cpupid(newfolio, cpupid); folio_migrate_ksm(newfolio, folio); /* * Please do not reorder this without considering how mm/ksm.c's * ksm_get_folio() depends upon ksm_migrate_page() and the * swapcache flag. */ if (folio_test_swapcache(folio)) folio_clear_swapcache(folio); folio_clear_private(folio); /* page->private contains hugetlb specific flags */ if (!folio_test_hugetlb(folio)) folio->private = NULL; /* * If any waiters have accumulated on the new page then * wake them up. */ if (folio_test_writeback(newfolio)) folio_end_writeback(newfolio); /* * PG_readahead shares the same bit with PG_reclaim. The above * end_page_writeback() may clear PG_readahead mistakenly, so set the * bit after that. */ if (folio_test_readahead(folio)) folio_set_readahead(newfolio); folio_copy_owner(newfolio, folio); pgalloc_tag_copy(newfolio, folio); mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); /************************************************************ * Migration functions ***********************************************************/ static int __migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, void *src_private, enum migrate_mode mode) { int rc, expected_count = folio_expected_refs(mapping, src); /* Check whether src does not have extra refs before we do more work */ if (folio_ref_count(src) != expected_count) return -EAGAIN; rc = folio_mc_copy(dst, src); if (unlikely(rc)) return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. * @dst: The folio to migrate the data to. * @src: The folio containing the current data. * @mode: How to migrate the page. * * Common logic to directly migrate a single LRU folio suitable for * folios that do not use PagePrivate/PagePrivate2. * * Folios are locked upon entry and exit. */ int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ return __migrate_folio(mapping, dst, src, NULL, mode); } EXPORT_SYMBOL(migrate_folio); #ifdef CONFIG_BUFFER_HEAD /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, enum migrate_mode mode) { struct buffer_head *bh = head; struct buffer_head *failed_bh; do { if (!trylock_buffer(bh)) { if (mode == MIGRATE_ASYNC) goto unlock; if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh)) goto unlock; lock_buffer(bh); } bh = bh->b_this_page; } while (bh != head); return true; unlock: /* We failed to lock the buffer and cannot stall. */ failed_bh = bh; bh = head; while (bh != failed_bh) { unlock_buffer(bh); bh = bh->b_this_page; } return false; } static int __buffer_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, bool check_refs) { struct buffer_head *bh, *head; int rc; int expected_count; head = folio_buffers(src); if (!head) return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ expected_count = folio_expected_refs(mapping, src); if (folio_ref_count(src) != expected_count) return -EAGAIN; if (!buffer_migrate_lock_buffers(head, mode)) return -EAGAIN; if (check_refs) { bool busy; bool invalidated = false; recheck_buffers: busy = false; spin_lock(&mapping->i_private_lock); bh = head; do { if (atomic_read(&bh->b_count)) { busy = true; break; } bh = bh->b_this_page; } while (bh != head); if (busy) { if (invalidated) { rc = -EAGAIN; goto unlock_buffers; } spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; } } rc = filemap_migrate_folio(mapping, dst, src, mode); if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; bh = head; do { folio_set_bh(bh, dst, bh_offset(bh)); bh = bh->b_this_page; } while (bh != head); unlock_buffers: if (check_refs) spin_unlock(&mapping->i_private_lock); bh = head; do { unlock_buffer(bh); bh = bh->b_this_page; } while (bh != head); return rc; } /** * buffer_migrate_folio() - Migration function for folios with buffers. * @mapping: The address space containing @src. * @dst: The folio to migrate to. * @src: The folio to migrate from. * @mode: How to migrate the folio. * * This function can only be used if the underlying filesystem guarantees * that no other references to @src exist. For example attached buffer * heads are accessed only under the folio lock. If your filesystem cannot * provide this guarantee, buffer_migrate_folio_norefs() may be more * appropriate. * * Return: 0 on success or a negative errno on failure. */ int buffer_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return __buffer_migrate_folio(mapping, dst, src, mode, false); } EXPORT_SYMBOL(buffer_migrate_folio); /** * buffer_migrate_folio_norefs() - Migration function for folios with buffers. * @mapping: The address space containing @src. * @dst: The folio to migrate to. * @src: The folio to migrate from. * @mode: How to migrate the folio. * * Like buffer_migrate_folio() except that this variant is more careful * and checks that there are also no buffer head references. This function * is the right one for mappings where buffer heads are directly looked * up and referenced (such as block device mappings). * * Return: 0 on success or a negative errno on failure. */ int buffer_migrate_folio_norefs(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return __buffer_migrate_folio(mapping, dst, src, mode, true); } EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); #endif /* CONFIG_BUFFER_HEAD */ int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return __migrate_folio(mapping, dst, src, folio_get_private(src), mode); } EXPORT_SYMBOL_GPL(filemap_migrate_folio); /* * Writeback a folio to clean the dirty state */ static int writeout(struct address_space *mapping, struct folio *folio) { struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1 }; int rc; if (!mapping->a_ops->writepage) /* No write method for the address space */ return -EINVAL; if (!folio_clear_dirty_for_io(folio)) /* Someone else already triggered a write */ return -EAGAIN; /* * A dirty folio may imply that the underlying filesystem has * the folio on some queue. So the folio must be clean for * migration. Writeout may mean we lose the lock and the * folio state is no longer what we checked for earlier. * At this point we know that the migration attempt cannot * be successful. */ remove_migration_ptes(folio, folio, 0); rc = mapping->a_ops->writepage(&folio->page, &wbc); if (rc != AOP_WRITEPAGE_ACTIVATE) /* unlocked. Relock */ folio_lock(folio); return (rc < 0) ? -EIO : -EAGAIN; } /* * Default handling if a filesystem does not provide a migration function. */ static int fallback_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { if (folio_test_dirty(src)) { /* Only writeback folios in full synchronous migration */ switch (mode) { case MIGRATE_SYNC: break; default: return -EBUSY; } return writeout(mapping, src); } /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_folio(mapping, dst, src, mode); } /* * Move a page to a newly allocated page * The page is locked and all ptes have been successfully removed. * * The new page will have replaced the old page if this function * is successful. * * Return value: * < 0 - error code * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc = -EAGAIN; bool is_lru = !__folio_test_movable(src); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); if (likely(is_lru)) { struct address_space *mapping = folio_mapping(src); if (!mapping) rc = migrate_folio(mapping, dst, src, mode); else if (mapping_inaccessible(mapping)) rc = -EOPNOTSUPP; else if (mapping->a_ops->migrate_folio) /* * Most folios have a mapping and most filesystems * provide a migrate_folio callback. Anonymous folios * are part of swap space which also has its own * migrate_folio callback. This is the most common path * for page migration. */ rc = mapping->a_ops->migrate_folio(mapping, dst, src, mode); else rc = fallback_migrate_folio(mapping, dst, src, mode); } else { const struct movable_operations *mops; /* * In case of non-lru page, it could be released after * isolation step. In that case, we shouldn't try migration. */ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); if (!folio_test_movable(src)) { rc = MIGRATEPAGE_SUCCESS; folio_clear_isolated(src); goto out; } mops = folio_movable_ops(src); rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); } /* * When successful, old pagecache src->mapping must be cleared before * src is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { if (__folio_test_movable(src)) { VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); /* * We clear PG_movable under page_lock so any compactor * cannot try to migrate this page. */ folio_clear_isolated(src); } /* * Anonymous and movable src->mapping will be cleared by * free_pages_prepare so don't reset it here for keeping * the type to work PageAnon, for example. */ if (!folio_mapping_flags(src)) src->mapping = NULL; if (likely(!folio_is_zone_device(dst))) flush_dcache_folio(dst); } out: return rc; } /* * To record some information during migration, we use unused private * field of struct folio of the newly allocated destination folio. * This is safe because nobody is using it except us. */ enum { PAGE_WAS_MAPPED = BIT(0), PAGE_WAS_MLOCKED = BIT(1), PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, }; static void __migrate_folio_record(struct folio *dst, int old_page_state, struct anon_vma *anon_vma) { dst->private = (void *)anon_vma + old_page_state; } static void __migrate_folio_extract(struct folio *dst, int *old_page_state, struct anon_vma **anon_vmap) { unsigned long private = (unsigned long)dst->private; *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES); *old_page_state = private & PAGE_OLD_STATES; dst->private = NULL; } /* Restore the source folio to the original state upon failure */ static void migrate_folio_undo_src(struct folio *src, int page_was_mapped, struct anon_vma *anon_vma, bool locked, struct list_head *ret) { if (page_was_mapped) remove_migration_ptes(src, src, 0); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); if (locked) folio_unlock(src); if (ret) list_move_tail(&src->lru, ret); } /* Restore the destination folio to the original state upon failure */ static void migrate_folio_undo_dst(struct folio *dst, bool locked, free_folio_t put_new_folio, unsigned long private) { if (locked) folio_unlock(dst); if (put_new_folio) put_new_folio(dst, private); else folio_put(dst); } /* Cleanup src folio upon migration success */ static void migrate_folio_done(struct folio *src, enum migrate_reason reason) { /* * Compaction can migrate also non-LRU pages which are * not accounted to NR_ISOLATED_*. They can be recognized * as __folio_test_movable */ if (likely(!__folio_test_movable(src))) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); if (reason != MR_MEMORY_FAILURE) /* We release the page in page_handle_poison. */ folio_put(src); } /* Obtain the lock on page, remove all ptes. */ static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool is_lru = data_race(!__folio_test_movable(src)); bool locked = false; bool dst_locked = false; if (folio_ref_count(src) == 1) { /* Folio was freed from under us. So we are done. */ folio_clear_active(src); folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. */ list_del(&src->lru); migrate_folio_done(src, reason); return MIGRATEPAGE_SUCCESS; } dst = get_new_folio(src, private); if (!dst) return -ENOMEM; *dstp = dst; dst->private = NULL; if (!folio_trylock(src)) { if (mode == MIGRATE_ASYNC) goto out; /* * It's not safe for direct compaction to call lock_page. * For example, during page readahead pages are added locked * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, * avoid the use of lock_page for direct compaction * altogether. */ if (current->flags & PF_MEMALLOC) goto out; /* * In "light" mode, we can wait for transient locks (eg * inserting a page into the page table), but it's not * worth waiting for I/O. */ if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src)) goto out; folio_lock(src); } locked = true; if (folio_test_mlocked(src)) old_page_state |= PAGE_WAS_MLOCKED; if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ switch (mode) { case MIGRATE_SYNC: break; default: rc = -EBUSY; goto out; } folio_wait_writeback(src); } /* * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ if (folio_test_anon(src) && !folio_test_ksm(src)) anon_vma = folio_get_anon_vma(src); /* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one * holding a reference to dst at this point. We used to have a BUG * here if folio_trylock(dst) fails, but would like to allow for * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG(). */ if (unlikely(!folio_trylock(dst))) goto out; dst_locked = true; if (unlikely(!is_lru)) { __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } /* * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory * offlining. Everywhere else except page reclaim, the page is * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed. */ if (!src->mapping) { if (folio_test_private(src)) { try_to_free_buffers(src); goto out; } } else if (folio_mapped(src)) { /* Establish migration ptes */ VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); old_page_state |= PAGE_WAS_MAPPED; } if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } out: /* * A folio that has not been unmapped will be restored to * right list unless we want to retry. */ if (rc == -EAGAIN) ret = NULL; migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; } /* Migrate the folio to the newly allocated folio in dst. */ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio *dst, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { int rc; int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__folio_test_movable(src); struct list_head *prev; __migrate_folio_extract(dst, &old_page_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); rc = move_to_new_folio(dst, src, mode); if (rc) goto out; if (unlikely(!is_lru)) goto out_unlock_both; /* * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ folio_add_lru(dst); if (old_page_state & PAGE_WAS_MLOCKED) lru_add_drain(); if (old_page_state & PAGE_WAS_MAPPED) remove_migration_ptes(src, dst, 0); out_unlock_both: folio_unlock(dst); set_page_owner_migrate_reason(&dst->page, reason); /* * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ folio_put(dst); /* * A folio that has been migrated has all references removed * and will be freed. */ list_del(&src->lru); /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); folio_unlock(src); migrate_folio_done(src, reason); return rc; out: /* * A folio that has not been migrated will be restored to * right list unless we want to retry. */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); __migrate_folio_record(dst, old_page_state, anon_vma); return rc; } migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; } /* * Counterpart of unmap_and_move_page() for hugepage migration. * * This function doesn't wait the completion of hugepage I/O * because there is no race between I/O and migration for hugepage. * Note that currently hugepage I/O occurs only in direct I/O * where no lock is held and PG_writeback is irrelevant, * and writeback status of all subpages are counted in the reference * count of the head page (i.e. if all subpages of a 2MB hugepage are * under direct I/O, the reference of the head page is 512 and a bit more.) * This means that when we try to migrate hugepage whose subpages are * doing direct I/O, some references remain after try_to_unmap() and * hugepage migration fails without data corruption. * * There is also no race when direct I/O is issued on the page under migration, * because then pte is replaced with migration swap entry and direct I/O code * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, int force, enum migrate_mode mode, int reason, struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_active_hugetlb(src); return MIGRATEPAGE_SUCCESS; } dst = get_new_folio(src, private); if (!dst) return -ENOMEM; if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { case MIGRATE_SYNC: break; default: goto out; } folio_lock(src); } /* * Check for pages which are in the process of being freed. Without * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } if (folio_test_anon(src)) anon_vma = folio_get_anon_vma(src); if (unlikely(!folio_trylock(dst))) goto put_anon; if (folio_mapped(src)) { enum ttu_flags ttu = 0; if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. Because of this, take * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; ttu = TTU_RMAP_LOCKED; } try_to_migrate(src, ttu); page_was_mapped = 1; if (ttu & TTU_RMAP_LOCKED) i_mmap_unlock_write(mapping); } if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) remove_migration_ptes(src, rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); unlock_put_anon: folio_unlock(dst); put_anon: if (anon_vma) put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) folio_putback_active_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); /* * If migration was not successful and there's a freeing callback, use * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ if (put_new_folio) put_new_folio(dst, private); else folio_putback_active_hugetlb(dst); return rc; } static inline int try_split_folio(struct folio *folio, struct list_head *split_folios, enum migrate_mode mode) { int rc; if (mode == MIGRATE_ASYNC) { if (!folio_trylock(folio)) return -EAGAIN; } else { folio_lock(folio); } rc = split_folio_to_list(folio, split_folios); folio_unlock(folio); if (!rc) list_move_tail(&folio->lru, split_folios); return rc; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR #else #define NR_MAX_BATCHED_MIGRATION 512 #endif #define NR_MAX_MIGRATE_PAGES_RETRY 10 #define NR_MAX_MIGRATE_ASYNC_RETRY 3 #define NR_MAX_MIGRATE_SYNC_RETRY \ (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY) struct migrate_pages_stats { int nr_succeeded; /* Normal and large folios migrated successfully, in units of base pages */ int nr_failed_pages; /* Normal and large folios failed to be migrated, in units of base pages. Untried folios aren't counted */ int nr_thp_succeeded; /* THP migrated successfully */ int nr_thp_failed; /* THP failed to be migrated */ int nr_thp_split; /* THP split before migrating */ int nr_split; /* Large folio (include THP) split before migrating */ }; /* * Returns the number of hugetlb folios that were not migrated, or an error code * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable * any more because the list has become empty or no retryable hugetlb folios * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. */ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct migrate_pages_stats *stats, struct list_head *ret_folios) { int retry = 1; int nr_failed = 0; int nr_retry_pages = 0; int pass = 0; struct folio *folio, *folio2; int rc, nr_pages; for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) { retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { if (!folio_test_hugetlb(folio)) continue; nr_pages = folio_nr_pages(folio); cond_resched(); /* * Migratability of hugepages depends on architectures and * their size. This check is necessary because some callers * of hugepage migration like soft offline and memory * hotremove don't walk through page tables or check whether * the hugepage is pmd-based or not before kicking migration. */ if (!hugepage_migration_supported(folio_hstate(folio))) { nr_failed++; stats->nr_failed_pages += nr_pages; list_move_tail(&folio->lru, ret_folios); continue; } rc = unmap_and_move_huge_page(get_new_folio, put_new_folio, private, folio, pass > 2, mode, reason, ret_folios); /* * The rules are: * Success: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { case -ENOMEM: /* * When memory is low, don't bother to try to migrate * other folios, just exit. */ stats->nr_failed_pages += nr_pages + nr_retry_pages; return -ENOMEM; case -EAGAIN: retry++; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: stats->nr_succeeded += nr_pages; break; default: /* * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed folio is * removed from migration folio list and not * retried in the next outer loop. */ nr_failed++; stats->nr_failed_pages += nr_pages; break; } } } /* * nr_failed is number of hugetlb folios failed to be migrated. After * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb * folios as failed. */ nr_failed += retry; stats->nr_failed_pages += nr_retry_pages; return nr_failed; } /* * migrate_pages_batch() first unmaps folios in the from list as many as * possible, then move the unmapped folios. * * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a * lock or bit when we have locked more than one folio. Which may cause * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the * length of the from list must be <= 1. */ static int migrate_pages_batch(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct list_head *split_folios, struct migrate_pages_stats *stats, int nr_pass) { int retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_retry_pages = 0; int pass = 0; bool is_thp = false; bool is_large = false; struct folio *folio, *folio2, *dst = NULL, *dst2; int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); LIST_HEAD(dst_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && !list_empty(from) && !list_is_singular(from)); for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; thp_retry = 0; nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { is_large = folio_test_large(folio); is_thp = is_large && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); /* * The rare folio on the deferred split list should * be split now. It should not count as a failure: * but increment nr_failed because, without doing so, * migrate_pages() may report success with (split but * unmigrated) pages still on its fromlist; whereas it * always reports success when its fromlist is empty. * stats->nr_thp_failed should be increased too, * otherwise stats inconsistency will happen when * migrate_pages_batch is called via migrate_pages() * with MIGRATE_SYNC and MIGRATE_ASYNC. * * Only check it without removing it from the list. * Since the folio can be on deferred_split_scan() * local list and removing it can cause the local list * corruption. Folio split process below can handle it * with the help of folio_ref_freeze(). * * nr_pages > 2 is needed to avoid checking order-1 * page cache folios. They exist, in contrast to * non-existent order-1 anonymous folios, and do not * use _deferred_list. */ if (nr_pages > 2 && !list_empty(&folio->_deferred_list) && folio_test_partially_mapped(folio)) { if (!try_split_folio(folio, split_folios, mode)) { nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_thp_split += is_thp; stats->nr_split++; continue; } } /* * Large folio migration might be unsupported or * the allocation might be failed so we should retry * on the same folio with the large folio split * to normal folios. * * Split folios are put in split_folios, and * we will migrate them after the rest of the * list is processed. */ if (!thp_migration_supported() && is_thp) { nr_failed++; stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios, mode)) { stats->nr_thp_split++; stats->nr_split++; continue; } stats->nr_failed_pages += nr_pages; list_move_tail(&folio->lru, ret_folios); continue; } rc = migrate_folio_unmap(get_new_folio, put_new_folio, private, folio, &dst, mode, reason, ret_folios); /* * The rules are: * Success: folio will be freed * Unmap: folio will be put on unmap_folios list, * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list */ switch(rc) { case -ENOMEM: /* * When memory is low, don't bother to try to migrate * other folios, move unmapped folios, then exit. */ nr_failed++; stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ if (is_large && !nosplit) { int ret = try_split_folio(folio, split_folios, mode); if (!ret) { stats->nr_thp_split += is_thp; stats->nr_split++; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { /* * Try again to split large folio to * mitigate the failure of longterm pinning. */ retry++; thp_retry += is_thp; nr_retry_pages += nr_pages; /* Undo duplicated failure counting. */ nr_failed--; stats->nr_thp_failed -= is_thp; break; } } stats->nr_failed_pages += nr_pages + nr_retry_pages; /* nr_failed isn't updated for not used */ stats->nr_thp_failed += thp_retry; rc_saved = rc; if (list_empty(&unmap_folios)) goto out; else goto move; case -EAGAIN: retry++; thp_retry += is_thp; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; case MIGRATEPAGE_UNMAP: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; default: /* * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed folio is * removed from migration folio list and not * retried in the next outer loop. */ nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } } } nr_failed += retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; move: /* Flush TLBs for all unmapped folios */ try_to_unmap_flush(); retry = 1; for (pass = 0; pass < nr_pass && retry; pass++) { retry = 0; thp_retry = 0; nr_retry_pages = 0; dst = list_first_entry(&dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); rc = migrate_folio_move(put_new_folio, private, folio, dst, mode, reason, ret_folios); /* * The rules are: * Success: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ switch(rc) { case -EAGAIN: retry++; thp_retry += is_thp; nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; default: nr_failed++; stats->nr_thp_failed += is_thp; stats->nr_failed_pages += nr_pages; break; } dst = dst2; dst2 = list_next_entry(dst, lru); } } nr_failed += retry; stats->nr_thp_failed += thp_retry; stats->nr_failed_pages += nr_retry_pages; rc = rc_saved ? : nr_failed; out: /* Cleanup remaining folios */ dst = list_first_entry(&dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { int old_page_state = 0; struct anon_vma *anon_vma = NULL; __migrate_folio_extract(dst, &old_page_state, &anon_vma); migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; dst2 = list_next_entry(dst, lru); } return rc; } static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, struct list_head *ret_folios, struct list_head *split_folios, struct migrate_pages_stats *stats) { int rc, nr_failed = 0; LIST_HEAD(folios); struct migrate_pages_stats astats; memset(&astats, 0, sizeof(astats)); /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &folios, split_folios, &astats, NR_MAX_MIGRATE_ASYNC_RETRY); stats->nr_succeeded += astats.nr_succeeded; stats->nr_thp_succeeded += astats.nr_thp_succeeded; stats->nr_thp_split += astats.nr_thp_split; stats->nr_split += astats.nr_split; if (rc < 0) { stats->nr_failed_pages += astats.nr_failed_pages; stats->nr_thp_failed += astats.nr_thp_failed; list_splice_tail(&folios, ret_folios); return rc; } stats->nr_thp_failed += astats.nr_thp_split; /* * Do not count rc, as pages will be retried below. * Count nr_split only, since it includes nr_thp_split. */ nr_failed += astats.nr_split; /* * Fall back to migrate all failed folios one by one synchronously. All * failed folios except split THPs will be retried, so their failure * isn't counted */ list_splice_tail_init(&folios, from); while (!list_empty(from)) { list_move(from->next, &folios); rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, ret_folios, split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); list_splice_tail_init(&folios, ret_folios); if (rc < 0) return rc; nr_failed += rc; } return nr_failed; } /* * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration * * @from: The list of folios to be migrated. * @get_new_folio: The function used to allocate free folios to be used * as the target of the folio migration. * @put_new_folio: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_folio() * @mode: The migration mode that specifies the constraints for * folio migration, if any. * @reason: The reason for folio migration. * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios * are movable any more because the list has become empty or no retryable folios * exist any more. It is caller's responsibility to call putback_movable_pages() * only if ret != 0. * * Returns the number of {normal folio, large folio, hugetlb} that were not * migrated, or an error code. The number of large folio splits will be * considered as the number of non-migrated large folio, no matter how many * split folios of the large folio are migrated successfully. */ int migrate_pages(struct list_head *from, new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int rc, rc_gather; int nr_pages; struct folio *folio, *folio2; LIST_HEAD(folios); LIST_HEAD(ret_folios); LIST_HEAD(split_folios); struct migrate_pages_stats stats; trace_mm_migrate_pages_start(mode, reason); memset(&stats, 0, sizeof(stats)); rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, mode, reason, &stats, &ret_folios); if (rc_gather < 0) goto out; again: nr_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { /* Retried hugetlb folios will be kept in list */ if (folio_test_hugetlb(folio)) { list_move_tail(&folio->lru, &ret_folios); continue; } nr_pages += folio_nr_pages(folio); if (nr_pages >= NR_MAX_BATCHED_MIGRATION) break; } if (nr_pages >= NR_MAX_BATCHED_MIGRATION) list_cut_before(&folios, from, &folio2->lru); else list_splice_init(from, &folios); if (mode == MIGRATE_ASYNC) rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats, NR_MAX_MIGRATE_PAGES_RETRY); else rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, private, mode, reason, &ret_folios, &split_folios, &stats); list_splice_tail_init(&folios, &ret_folios); if (rc < 0) { rc_gather = rc; list_splice_tail(&split_folios, &ret_folios); goto out; } if (!list_empty(&split_folios)) { /* * Failure isn't counted since all split folios of a large folio * is counted as 1 failure already. And, we only try to migrate * with minimal effort, force MIGRATE_ASYNC mode and retry once. */ migrate_pages_batch(&split_folios, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, reason, &ret_folios, NULL, &stats, 1); list_splice_tail_init(&split_folios, &ret_folios); } rc_gather += rc; if (!list_empty(from)) goto again; out: /* * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ list_splice(&ret_folios, from); /* * Return 0 in case all split folios of fail-to-migrate large folios * are migrated successfully. */ if (list_empty(from)) rc_gather = 0; count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, stats.nr_thp_split, stats.nr_split, mode, reason); if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; return rc_gather; } struct folio *alloc_migration_target(struct folio *src, unsigned long private) { struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; int nid; int zidx; mtc = (struct migration_target_control *)private; gfp_mask = mtc->gfp_mask; nid = mtc->nid; if (nid == NUMA_NO_NODE) nid = folio_nid(src); if (folio_test_hugetlb(src)) { struct hstate *h = folio_hstate(src); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_hugetlb_folio_nodemask(h, nid, mtc->nmask, gfp_mask, htlb_allow_alloc_fallback(mtc->reason)); } if (folio_test_large(src)) { /* * clear __GFP_RECLAIM to make the migration callback * consistent with regular THP allocations. */ gfp_mask &= ~__GFP_RECLAIM; gfp_mask |= GFP_TRANSHUGE; order = folio_order(src); } zidx = zone_idx(folio_zone(src)); if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) gfp_mask |= __GFP_HIGHMEM; return __folio_alloc(gfp_mask, order, nid, mtc->nmask); } #ifdef CONFIG_NUMA static int store_status(int __user *status, int start, int value, int nr) { while (nr-- > 0) { if (put_user(value, status + start)) return -EFAULT; start++; } return 0; } static int do_move_pages_to_node(struct list_head *pagelist, int node) { int err; struct migration_target_control mtc = { .nid = node, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, .reason = MR_SYSCALL, }; err = migrate_pages(pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(pagelist); return err; } static int __add_folio_for_migration(struct folio *folio, int node, struct list_head *pagelist, bool migrate_all) { if (is_zero_folio(folio) || is_huge_zero_folio(folio)) return -EFAULT; if (folio_is_zone_device(folio)) return -ENOENT; if (folio_nid(folio) == node) return 0; if (folio_likely_mapped_shared(folio) && !migrate_all) return -EACCES; if (folio_test_hugetlb(folio)) { if (isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); return 1; } return -EBUSY; } /* * Resolves the given address to a struct folio, isolates it from the LRU and * puts it to the given pagelist. * Returns: * errno - if the folio cannot be found/isolated * 0 - when it doesn't have to be migrated because it is already on the * target node * 1 - when it has been queued */ static int add_folio_for_migration(struct mm_struct *mm, const void __user *p, int node, struct list_head *pagelist, bool migrate_all) { struct vm_area_struct *vma; struct folio_walk fw; struct folio *folio; unsigned long addr; int err = -EFAULT; mmap_read_lock(mm); addr = (unsigned long)untagged_addr_remote(mm, p); vma = vma_lookup(mm, addr); if (vma && vma_migratable(vma)) { folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); if (folio) { err = __add_folio_for_migration(folio, node, pagelist, migrate_all); folio_walk_end(&fw, vma); } else { err = -ENOENT; } } mmap_read_unlock(mm); return err; } static int move_pages_and_store_status(int node, struct list_head *pagelist, int __user *status, int start, int i, unsigned long nr_pages) { int err; if (list_empty(pagelist)) return 0; err = do_move_pages_to_node(pagelist, node); if (err) { /* * Positive err means the number of failed * pages to migrate. Since we are going to * abort and return the number of non-migrated * pages, so need to include the rest of the * nr_pages that have not been attempted as * well. */ if (err > 0) err += nr_pages - i; return err; } return store_status(status, start, node, i - start); } /* * Migrate an array of page address onto an array of nodes and fill * the corresponding array of status. */ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { compat_uptr_t __user *compat_pages = (void __user *)pages; int current_node = NUMA_NO_NODE; LIST_HEAD(pagelist); int start, i; int err = 0, err1; lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; int node; err = -EFAULT; if (in_compat_syscall()) { compat_uptr_t cp; if (get_user(cp, compat_pages + i)) goto out_flush; p = compat_ptr(cp); } else { if (get_user(p, pages + i)) goto out_flush; } if (get_user(node, nodes + i)) goto out_flush; err = -ENODEV; if (node < 0 || node >= MAX_NUMNODES) goto out_flush; if (!node_state(node, N_MEMORY)) goto out_flush; err = -EACCES; if (!node_isset(node, task_nodes)) goto out_flush; if (current_node == NUMA_NO_NODE) { current_node = node; start = i; } else if (node != current_node) { err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) goto out; start = i; current_node = node; } /* * Errors in the page lookup or isolation are not fatal and we simply * report them via status */ err = add_folio_for_migration(mm, p, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); if (err > 0) { /* The page is successfully queued for migration */ continue; } /* * The move_pages() man page does not have an -EEXIST choice, so * use -EFAULT instead. */ if (err == -EEXIST) err = -EFAULT; /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. */ err = store_status(status, i, err ? : current_node, 1); if (err) goto out_flush; err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) { /* We have accounted for page i */ if (err > 0) err--; goto out; } current_node = NUMA_NO_NODE; } out_flush: /* Make sure we do not overwrite the existing error */ err1 = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err >= 0) err = err1; out: lru_cache_enable(); return err; } /* * Determine the nodes of an array of pages and store it in an array of status. */ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, const void __user **pages, int *status) { unsigned long i; mmap_read_lock(mm); for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); struct vm_area_struct *vma; struct folio_walk fw; struct folio *folio; int err = -EFAULT; vma = vma_lookup(mm, addr); if (!vma) goto set_status; folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE); if (folio) { if (is_zero_folio(folio) || is_huge_zero_folio(folio)) err = -EFAULT; else if (folio_is_zone_device(folio)) err = -ENOENT; else err = folio_nid(folio); folio_walk_end(&fw, vma); } else { err = -ENOENT; } set_status: *status = err; pages++; status++; } mmap_read_unlock(mm); } static int get_compat_pages_array(const void __user *chunk_pages[], const void __user * __user *pages, unsigned long chunk_nr) { compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; compat_uptr_t p; int i; for (i = 0; i < chunk_nr; i++) { if (get_user(p, pages32 + i)) return -EFAULT; chunk_pages[i] = compat_ptr(p); } return 0; } /* * Determine the nodes of a user array of pages and store it in * a user array of status. */ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, const void __user * __user *pages, int __user *status) { #define DO_PAGES_STAT_CHUNK_NR 16UL const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; while (nr_pages) { unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR); if (in_compat_syscall()) { if (get_compat_pages_array(chunk_pages, pages, chunk_nr)) break; } else { if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) break; } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) break; pages += chunk_nr; status += chunk_nr; nr_pages -= chunk_nr; } return nr_pages ? -EFAULT : 0; } static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) { struct task_struct *task; struct mm_struct *mm; /* * There is no need to check if current process has the right to modify * the specified process when they are same. */ if (!pid) { mmget(current->mm); *mem_nodes = cpuset_mems_allowed(current); return current->mm; } task = find_get_task_by_vpid(pid); if (!task) { return ERR_PTR(-ESRCH); } /* * Check if this process has the right to modify the specified * process. Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { mm = ERR_PTR(-EPERM); goto out; } mm = ERR_PTR(security_task_movememory(task)); if (IS_ERR(mm)) goto out; *mem_nodes = cpuset_mems_allowed(task); mm = get_task_mm(task); out: put_task_struct(task); if (!mm) mm = ERR_PTR(-EINVAL); return mm; } /* * Move a list of pages in the address space of the currently executing * process. */ static int kernel_move_pages(pid_t pid, unsigned long nr_pages, const void __user * __user *pages, const int __user *nodes, int __user *status, int flags) { struct mm_struct *mm; int err; nodemask_t task_nodes; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; mm = find_mm_struct(pid, &task_nodes); if (IS_ERR(mm)) return PTR_ERR(mm); if (nodes) err = do_pages_move(mm, task_nodes, nr_pages, pages, nodes, status, flags); else err = do_pages_stat(mm, nr_pages, pages, status); mmput(mm); return err; } SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, const void __user * __user *, pages, const int __user *, nodes, int __user *, status, int, flags) { return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA * pages. Currently it only checks the watermarks which is crude. */ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, unsigned long nr_migrate_pages) { int z; for (z = pgdat->nr_zones - 1; z >= 0; z--) { struct zone *zone = pgdat->node_zones + z; if (!managed_zone(zone)) continue; /* Avoid waking kswapd by allocating pages_to_migrate pages. */ if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, ZONE_MOVABLE, ALLOC_CMA)) continue; return true; } return false; } static struct folio *alloc_misplaced_dst_folio(struct folio *src, unsigned long data) { int nid = (int) data; int order = folio_order(src); gfp_t gfp = __GFP_THISNODE; if (order > 0) gfp |= GFP_TRANSHUGE_LIGHT; else { gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; gfp &= ~__GFP_RECLAIM; } return __folio_alloc_node(gfp, order, nid); } /* * Prepare for calling migrate_misplaced_folio() by isolating the folio if * permitted. Must be called with the PTL still held. */ int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { int nr_pages = folio_nr_pages(folio); pg_data_t *pgdat = NODE_DATA(node); if (folio_is_file_lru(folio)) { /* * Do not migrate file folios that are mapped in multiple * processes with execute permissions as they are probably * shared libraries. * * See folio_likely_mapped_shared() on possible imprecision * when we cannot easily detect if a folio is shared. */ if ((vma->vm_flags & VM_EXEC) && folio_likely_mapped_shared(folio)) return -EACCES; /* * Do not migrate dirty folios as not all filesystems can move * dirty folios in MIGRATE_ASYNC mode which is a waste of * cycles. */ if (folio_test_dirty(folio)) return -EAGAIN; } /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { int z; if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) return -EAGAIN; for (z = pgdat->nr_zones - 1; z >= 0; z--) { if (managed_zone(pgdat->node_zones + z)) break; } /* * If there are no managed zones, it should not proceed * further. */ if (z < 0) return -EAGAIN; wakeup_kswapd(pgdat->node_zones + z, 0, folio_order(folio), ZONE_MOVABLE); return -EAGAIN; } if (!folio_isolate_lru(folio)) return -EAGAIN; node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), nr_pages); return 0; } /* * Attempt to migrate a misplaced folio to the specified destination * node. Caller is expected to have isolated the folio by calling * migrate_misplaced_folio_prepare(), which will result in an * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. */ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); list_add(&folio->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining && !list_empty(&migratepages)) putback_movable_pages(&migratepages); if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded); if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(folio_nid(folio)) && node_is_toptier(node)) mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded); } mem_cgroup_put(memcg); BUG_ON(!list_empty(&migratepages)); return nr_remaining ? -EAGAIN : 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */
67 65 66 66 66 66 66 67 57 52 1 49 1 57 42 42 39 38 40 79 78 75 75 57 56 54 53 2 52 50 18 66 65 64 64 64 63 60 1 11 61 61 61 50 51 41 42 40 10 9 52 52 56 56 57 57 70 80 22 57 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 // SPDX-License-Identifier: GPL-2.0-or-later /* Crypto operations using stored keys * * Copyright (c) 2016, Intel Corporation */ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/scatterlist.h> #include <linux/crypto.h> #include <crypto/hash.h> #include <crypto/kpp.h> #include <crypto/dh.h> #include <crypto/kdf_sp800108.h> #include <keys/user-type.h> #include "internal.h" static ssize_t dh_data_from_key(key_serial_t keyid, const void **data) { struct key *key; key_ref_t key_ref; long status; ssize_t ret; key_ref = lookup_user_key(keyid, 0, KEY_NEED_READ); if (IS_ERR(key_ref)) { ret = -ENOKEY; goto error; } key = key_ref_to_ptr(key_ref); ret = -EOPNOTSUPP; if (key->type == &key_type_user) { down_read(&key->sem); status = key_validate(key); if (status == 0) { const struct user_key_payload *payload; uint8_t *duplicate; payload = user_key_payload_locked(key); duplicate = kmemdup(payload->data, payload->datalen, GFP_KERNEL); if (duplicate) { *data = duplicate; ret = payload->datalen; } else { ret = -ENOMEM; } } up_read(&key->sem); } key_put(key); error: return ret; } static void dh_free_data(struct dh *dh) { kfree_sensitive(dh->key); kfree_sensitive(dh->p); kfree_sensitive(dh->g); } static int kdf_alloc(struct crypto_shash **hash, char *hashname) { struct crypto_shash *tfm; /* allocate synchronous hash */ tfm = crypto_alloc_shash(hashname, 0, 0); if (IS_ERR(tfm)) { pr_info("could not allocate digest TFM handle %s\n", hashname); return PTR_ERR(tfm); } if (crypto_shash_digestsize(tfm) == 0) { crypto_free_shash(tfm); return -EINVAL; } *hash = tfm; return 0; } static void kdf_dealloc(struct crypto_shash *hash) { if (hash) crypto_free_shash(hash); } static int keyctl_dh_compute_kdf(struct crypto_shash *hash, char __user *buffer, size_t buflen, uint8_t *kbuf, size_t kbuflen) { struct kvec kbuf_iov = { .iov_base = kbuf, .iov_len = kbuflen }; uint8_t *outbuf = NULL; int ret; size_t outbuf_len = roundup(buflen, crypto_shash_digestsize(hash)); outbuf = kmalloc(outbuf_len, GFP_KERNEL); if (!outbuf) { ret = -ENOMEM; goto err; } ret = crypto_kdf108_ctr_generate(hash, &kbuf_iov, 1, outbuf, outbuf_len); if (ret) goto err; ret = buflen; if (copy_to_user(buffer, outbuf, buflen) != 0) ret = -EFAULT; err: kfree_sensitive(outbuf); return ret; } long __keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params *kdfcopy) { long ret; ssize_t dlen; int secretlen; int outlen; struct keyctl_dh_params pcopy; struct dh dh_inputs; struct scatterlist outsg; DECLARE_CRYPTO_WAIT(compl); struct crypto_kpp *tfm; struct kpp_request *req; uint8_t *secret; uint8_t *outbuf; struct crypto_shash *hash = NULL; if (!params || (!buffer && buflen)) { ret = -EINVAL; goto out1; } if (copy_from_user(&pcopy, params, sizeof(pcopy)) != 0) { ret = -EFAULT; goto out1; } if (kdfcopy) { char *hashname; if (memchr_inv(kdfcopy->__spare, 0, sizeof(kdfcopy->__spare))) { ret = -EINVAL; goto out1; } if (buflen > KEYCTL_KDF_MAX_OUTPUT_LEN || kdfcopy->otherinfolen > KEYCTL_KDF_MAX_OI_LEN) { ret = -EMSGSIZE; goto out1; } /* get KDF name string */ hashname = strndup_user(kdfcopy->hashname, CRYPTO_MAX_ALG_NAME); if (IS_ERR(hashname)) { ret = PTR_ERR(hashname); goto out1; } /* allocate KDF from the kernel crypto API */ ret = kdf_alloc(&hash, hashname); kfree(hashname); if (ret) goto out1; } memset(&dh_inputs, 0, sizeof(dh_inputs)); dlen = dh_data_from_key(pcopy.prime, &dh_inputs.p); if (dlen < 0) { ret = dlen; goto out1; } dh_inputs.p_size = dlen; dlen = dh_data_from_key(pcopy.base, &dh_inputs.g); if (dlen < 0) { ret = dlen; goto out2; } dh_inputs.g_size = dlen; dlen = dh_data_from_key(pcopy.private, &dh_inputs.key); if (dlen < 0) { ret = dlen; goto out2; } dh_inputs.key_size = dlen; secretlen = crypto_dh_key_len(&dh_inputs); secret = kmalloc(secretlen, GFP_KERNEL); if (!secret) { ret = -ENOMEM; goto out2; } ret = crypto_dh_encode_key(secret, secretlen, &dh_inputs); if (ret) goto out3; tfm = crypto_alloc_kpp("dh", 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto out3; } ret = crypto_kpp_set_secret(tfm, secret, secretlen); if (ret) goto out4; outlen = crypto_kpp_maxsize(tfm); if (!kdfcopy) { /* * When not using a KDF, buflen 0 is used to read the * required buffer length */ if (buflen == 0) { ret = outlen; goto out4; } else if (outlen > buflen) { ret = -EOVERFLOW; goto out4; } } outbuf = kzalloc(kdfcopy ? (outlen + kdfcopy->otherinfolen) : outlen, GFP_KERNEL); if (!outbuf) { ret = -ENOMEM; goto out4; } sg_init_one(&outsg, outbuf, outlen); req = kpp_request_alloc(tfm, GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out5; } kpp_request_set_input(req, NULL, 0); kpp_request_set_output(req, &outsg, outlen); kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &compl); /* * For DH, generate_public_key and generate_shared_secret are * the same calculation */ ret = crypto_kpp_generate_public_key(req); ret = crypto_wait_req(ret, &compl); if (ret) goto out6; if (kdfcopy) { /* * Concatenate SP800-56A otherinfo past DH shared secret -- the * input to the KDF is (DH shared secret || otherinfo) */ if (copy_from_user(outbuf + req->dst_len, kdfcopy->otherinfo, kdfcopy->otherinfolen) != 0) { ret = -EFAULT; goto out6; } ret = keyctl_dh_compute_kdf(hash, buffer, buflen, outbuf, req->dst_len + kdfcopy->otherinfolen); } else if (copy_to_user(buffer, outbuf, req->dst_len) == 0) { ret = req->dst_len; } else { ret = -EFAULT; } out6: kpp_request_free(req); out5: kfree_sensitive(outbuf); out4: crypto_free_kpp(tfm); out3: kfree_sensitive(secret); out2: dh_free_data(&dh_inputs); out1: kdf_dealloc(hash); return ret; } long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { struct keyctl_kdf_params kdfcopy; if (!kdf) return __keyctl_dh_compute(params, buffer, buflen, NULL); if (copy_from_user(&kdfcopy, kdf, sizeof(kdfcopy)) != 0) return -EFAULT; return __keyctl_dh_compute(params, buffer, buflen, &kdfcopy); }
6730 477 38 3111 1136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BIT_SPINLOCK_H #define __LINUX_BIT_SPINLOCK_H #include <linux/kernel.h> #include <linux/preempt.h> #include <linux/atomic.h> #include <linux/bug.h> /* * bit-based spin_lock() * * Don't use this unless you really need to: spin_lock() and spin_unlock() * are significantly faster. */ static inline void bit_spin_lock(int bitnum, unsigned long *addr) { /* * Assuming the lock is uncontended, this never enters * the body of the outer loop. If it is contended, then * within the inner loop a non-atomic test is used to * busywait with less bus contention for a good time to * attempt to acquire the lock bit. */ preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) while (unlikely(test_and_set_bit_lock(bitnum, addr))) { preempt_enable(); do { cpu_relax(); } while (test_bit(bitnum, addr)); preempt_disable(); } #endif __acquire(bitlock); } /* * Return true if it was acquired */ static inline int bit_spin_trylock(int bitnum, unsigned long *addr) { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) if (unlikely(test_and_set_bit_lock(bitnum, addr))) { preempt_enable(); return 0; } #endif __acquire(bitlock); return 1; } /* * bit-based spin_unlock() */ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) clear_bit_unlock(bitnum, addr); #endif preempt_enable(); __release(bitlock); } /* * bit-based spin_unlock() * non-atomic version, which can be used eg. if the bit lock itself is * protecting the rest of the flags in the word. */ static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) __clear_bit_unlock(bitnum, addr); #endif preempt_enable(); __release(bitlock); } /* * Return true if the lock is held. */ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) { #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) return test_bit(bitnum, addr); #elif defined CONFIG_PREEMPT_COUNT return preempt_count(); #else return 1; #endif } #endif /* __LINUX_BIT_SPINLOCK_H */
71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef FS_MINIX_H #define FS_MINIX_H #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/minix_fs.h> #define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version #define MINIX_V1 0x0001 /* original minix fs */ #define MINIX_V2 0x0002 /* minix V2 fs */ #define MINIX_V3 0x0003 /* minix V3 fs */ /* * minix fs inode data in memory */ struct minix_inode_info { union { __u16 i1_data[16]; __u32 i2_data[16]; } u; struct inode vfs_inode; }; /* * minix super-block data in memory */ struct minix_sb_info { unsigned long s_ninodes; unsigned long s_nzones; unsigned long s_imap_blocks; unsigned long s_zmap_blocks; unsigned long s_firstdatazone; unsigned long s_log_zone_size; int s_dirsize; int s_namelen; struct buffer_head ** s_imap; struct buffer_head ** s_zmap; struct buffer_head * s_sbh; struct minix_super_block * s_ms; unsigned short s_mount_state; unsigned short s_version; }; struct inode *minix_iget(struct super_block *, unsigned long); struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); struct inode *minix_new_inode(const struct inode *, umode_t); void minix_free_inode(struct inode *inode); unsigned long minix_count_free_inodes(struct super_block *sb); int minix_new_block(struct inode *inode); void minix_free_block(struct inode *inode, unsigned long block); unsigned long minix_count_free_blocks(struct super_block *sb); int minix_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); extern void V2_minix_truncate(struct inode *); extern void minix_truncate(struct inode *); extern void minix_set_inode(struct inode *, dev_t); extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); extern unsigned V1_minix_blocks(loff_t, struct super_block *); extern unsigned V2_minix_blocks(loff_t, struct super_block *); struct minix_dir_entry *minix_find_entry(struct dentry *, struct folio **); int minix_add_link(struct dentry*, struct inode*); int minix_delete_entry(struct minix_dir_entry *, struct folio *); int minix_make_empty(struct inode*, struct inode*); int minix_empty_dir(struct inode*); int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode); struct minix_dir_entry *minix_dotdot(struct inode*, struct folio **); ino_t minix_inode_by_name(struct dentry*); extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; extern const struct file_operations minix_file_operations; extern const struct file_operations minix_dir_operations; static inline struct minix_sb_info *minix_sb(struct super_block *sb) { return sb->s_fs_info; } static inline struct minix_inode_info *minix_i(struct inode *inode) { return container_of(inode, struct minix_inode_info, vfs_inode); } static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) { return DIV_ROUND_UP(bits, blocksize * 8); } #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) #error Minix file system byte order broken #elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) /* * big-endian 32 or 64 bit indexed bitmaps on big-endian system or * little-endian bitmaps on little-endian system */ #define minix_test_and_set_bit(nr, addr) \ __test_and_set_bit((nr), (unsigned long *)(addr)) #define minix_set_bit(nr, addr) \ __set_bit((nr), (unsigned long *)(addr)) #define minix_test_and_clear_bit(nr, addr) \ __test_and_clear_bit((nr), (unsigned long *)(addr)) #define minix_test_bit(nr, addr) \ test_bit((nr), (unsigned long *)(addr)) #define minix_find_first_zero_bit(addr, size) \ find_first_zero_bit((unsigned long *)(addr), (size)) #elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) /* * big-endian 16bit indexed bitmaps */ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) { const unsigned short *p = vaddr, *addr = vaddr; unsigned short num; if (!size) return 0; size >>= 4; while (*p++ == 0xffff) { if (--size == 0) return (p - addr) << 4; } num = *--p; return ((p - addr) << 4) + ffz(num); } #define minix_test_and_set_bit(nr, addr) \ __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr)) #define minix_set_bit(nr, addr) \ __set_bit((nr) ^ 16, (unsigned long *)(addr)) #define minix_test_and_clear_bit(nr, addr) \ __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr)) static inline int minix_test_bit(int nr, const void *vaddr) { const unsigned short *p = vaddr; return (p[nr >> 4] & (1U << (nr & 15))) != 0; } #else /* * little-endian bitmaps */ #define minix_test_and_set_bit __test_and_set_bit_le #define minix_set_bit __set_bit_le #define minix_test_and_clear_bit __test_and_clear_bit_le #define minix_test_bit test_bit_le #define minix_find_first_zero_bit find_first_zero_bit_le #endif #endif /* FS_MINIX_H */
1071 966 1071 1069 831 833 964 1072 164 1071 1072 1070 1066 1071 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 // SPDX-License-Identifier: GPL-2.0 /* * jump label x86 support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * */ #include <linux/jump_label.h> #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/cpu.h> #include <asm/kprobes.h> #include <asm/alternative.h> #include <asm/text-patching.h> #include <asm/insn.h> int arch_jump_entry_size(struct jump_entry *entry) { struct insn insn = {}; insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); BUG_ON(insn.length != 2 && insn.length != 5); return insn.length; } struct jump_label_patch { const void *code; int size; }; static struct jump_label_patch __jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { const void *expect, *code, *nop; const void *addr, *dest; int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); size = arch_jump_entry_size(entry); switch (size) { case JMP8_INSN_SIZE: code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; case JMP32_INSN_SIZE: code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; default: BUG(); } if (type == JUMP_LABEL_JMP) expect = nop; else expect = code; if (memcmp(addr, expect, size)) { /* * The location is not an op that we were expecting. * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", addr, addr, addr, expect, size, type); BUG(); } if (type == JUMP_LABEL_NOP) code = nop; return (struct jump_label_patch){.code = code, .size = size}; } static __always_inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that * system_state is SYSTEM_BOOTING guarantees it. It will be set to * SYSTEM_SCHEDULING before other cores are awaken and before the * code is write-protected. * * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { mutex_lock(&text_mutex); __jump_label_transform(entry, type, init); mutex_unlock(&text_mutex); } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { jump_label_transform(entry, type, 0); } bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* * Fallback to the non-batching mode. */ arch_jump_label_transform(entry, type); return true; } mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); text_poke_finish(); mutex_unlock(&text_mutex); }
21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 /* * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ /* zstd_ddict.c : * concentrates all logic that needs to know the internals of ZSTD_DDict object */ /*-******************************************************* * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "zstd_decompress_internal.h" #include "zstd_ddict.h" /*-******************************************************* * Types *********************************************************/ struct ZSTD_DDict_s { void* dictBuffer; const void* dictContent; size_t dictSize; ZSTD_entropyDTables_t entropy; U32 dictID; U32 entropyPresent; ZSTD_customMem cMem; }; /* typedef'd to ZSTD_DDict within "zstd.h" */ const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict) { assert(ddict != NULL); return ddict->dictContent; } size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict) { assert(ddict != NULL); return ddict->dictSize; } void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) { DEBUGLOG(4, "ZSTD_copyDDictParameters"); assert(dctx != NULL); assert(ddict != NULL); dctx->dictID = ddict->dictID; dctx->prefixStart = ddict->dictContent; dctx->virtualStart = ddict->dictContent; dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; dctx->previousDstEnd = dctx->dictEnd; #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION dctx->dictContentBeginForFuzzing = dctx->prefixStart; dctx->dictContentEndForFuzzing = dctx->previousDstEnd; #endif if (ddict->entropyPresent) { dctx->litEntropy = 1; dctx->fseEntropy = 1; dctx->LLTptr = ddict->entropy.LLTable; dctx->MLTptr = ddict->entropy.MLTable; dctx->OFTptr = ddict->entropy.OFTable; dctx->HUFptr = ddict->entropy.hufTable; dctx->entropy.rep[0] = ddict->entropy.rep[0]; dctx->entropy.rep[1] = ddict->entropy.rep[1]; dctx->entropy.rep[2] = ddict->entropy.rep[2]; } else { dctx->litEntropy = 0; dctx->fseEntropy = 0; } } static size_t ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict, ZSTD_dictContentType_e dictContentType) { ddict->dictID = 0; ddict->entropyPresent = 0; if (dictContentType == ZSTD_dct_rawContent) return 0; if (ddict->dictSize < 8) { if (dictContentType == ZSTD_dct_fullDict) return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ return 0; /* pure content mode */ } { U32 const magic = MEM_readLE32(ddict->dictContent); if (magic != ZSTD_MAGIC_DICTIONARY) { if (dictContentType == ZSTD_dct_fullDict) return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ return 0; /* pure content mode */ } } ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE); /* load entropy tables */ RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy( &ddict->entropy, ddict->dictContent, ddict->dictSize)), dictionary_corrupted, ""); ddict->entropyPresent = 1; return 0; } static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { ddict->dictBuffer = NULL; ddict->dictContent = dict; if (!dict) dictSize = 0; } else { void* const internalBuffer = ZSTD_customMalloc(dictSize, ddict->cMem); ddict->dictBuffer = internalBuffer; ddict->dictContent = internalBuffer; if (!internalBuffer) return ERROR(memory_allocation); ZSTD_memcpy(internalBuffer, dict, dictSize); } ddict->dictSize = dictSize; ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ /* parse dictionary content */ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); return 0; } ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_customMalloc(sizeof(ZSTD_DDict), customMem); if (ddict == NULL) return NULL; ddict->cMem = customMem; { size_t const initResult = ZSTD_initDDict_internal(ddict, dict, dictSize, dictLoadMethod, dictContentType); if (ZSTD_isError(initResult)) { ZSTD_freeDDict(ddict); return NULL; } } return ddict; } } /*! ZSTD_createDDict() : * Create a digested dictionary, to start decompression without startup delay. * `dict` content is copied inside DDict. * Consequently, `dict` can be released after `ZSTD_DDict` creation */ ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator); } /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, to start decompression without startup delay. * Dictionary content is simply referenced, it will be accessed during decompression. * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */ ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) { ZSTD_customMem const allocator = { NULL, NULL, NULL }; return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator); } const ZSTD_DDict* ZSTD_initStaticDDict( void* sBuffer, size_t sBufferSize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) { size_t const neededSpace = sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer; assert(sBuffer != NULL); assert(dict != NULL); if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */ if (sBufferSize < neededSpace) return NULL; if (dictLoadMethod == ZSTD_dlm_byCopy) { ZSTD_memcpy(ddict+1, dict, dictSize); /* local copy */ dict = ddict+1; } if (ZSTD_isError( ZSTD_initDDict_internal(ddict, dict, dictSize, ZSTD_dlm_byRef, dictContentType) )) return NULL; return ddict; } size_t ZSTD_freeDDict(ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support free on NULL */ { ZSTD_customMem const cMem = ddict->cMem; ZSTD_customFree(ddict->dictBuffer, cMem); ZSTD_customFree(ddict, cMem); return 0; } } /*! ZSTD_estimateDDictSize() : * Estimate amount of memory that will be needed to create a dictionary for decompression. * Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */ size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) { return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); } size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; /* support sizeof on NULL */ return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ; } /*! ZSTD_getDictID_fromDDict() : * Provides the dictID of the dictionary loaded into `ddict`. * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) { if (ddict==NULL) return 0; return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); }
2135 1096 2052 1934 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H #include <linux/mm.h> #include <linux/percpu_counter.h> #include <linux/atomic.h> #include <uapi/linux/mman.h> /* * Arrange for legacy / undefined architecture specific flags to be * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 #endif #ifndef MAP_ABOVE4G #define MAP_ABOVE4G 0 #endif #ifndef MAP_HUGE_2MB #define MAP_HUGE_2MB 0 #endif #ifndef MAP_HUGE_1GB #define MAP_HUGE_1GB 0 #endif #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif #ifndef MAP_SYNC #define MAP_SYNC 0 #endif /* * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ | MAP_FIXED \ | MAP_ANONYMOUS \ | MAP_DENYWRITE \ | MAP_EXECUTABLE \ | MAP_UNINITIALIZED \ | MAP_GROWSDOWN \ | MAP_LOCKED \ | MAP_NORESERVE \ | MAP_POPULATE \ | MAP_NONBLOCK \ | MAP_STACK \ | MAP_HUGETLB \ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ | MAP_HUGE_1GB) extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP extern s32 vm_committed_as_batch; extern void mm_compute_batch(int overcommit_policy); #else #define vm_committed_as_batch 0 static inline void mm_compute_batch(int overcommit_policy) { } #endif unsigned long vm_memory_committed(void); static inline void vm_acct_memory(long pages) { percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); } static inline void vm_unacct_memory(long pages) { vm_acct_memory(-pages); } /* * Allow architectures to handle additional protection and flag bits. The * overriding macros must be defined in the arch-specific asm/mman.h file. */ #ifndef arch_calc_vm_prot_bits #define arch_calc_vm_prot_bits(prot, pkey) 0 #endif #ifndef arch_calc_vm_flag_bits #define arch_calc_vm_flag_bits(flags) 0 #endif #ifndef arch_validate_prot /* * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have * already been masked out. * * Returns true if the prot flags are valid */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; } #define arch_validate_prot arch_validate_prot #endif #ifndef arch_validate_flags /* * This is called from mmap() and mprotect() with the updated vma->vm_flags. * * Returns true if the VM_* flags are valid. */ static inline bool arch_validate_flags(unsigned long flags) { return true; } #define arch_validate_flags arch_validate_flags #endif /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 * but this version is faster. * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. */ static inline unsigned long calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | arch_calc_vm_prot_bits(prot, pkey); } /* * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | arch_calc_vm_flag_bits(flags); } unsigned long vm_commit_limit(void); #ifndef arch_memory_deny_write_exec_supported static inline bool arch_memory_deny_write_exec_supported(void) { return true; } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif /* * Denies creating a writable executable mapping or gaining executable permissions. * * This denies the following: * * a) mmap(PROT_WRITE | PROT_EXEC) * * b) mmap(PROT_WRITE) * mprotect(PROT_EXEC) * * c) mmap(PROT_WRITE) * mprotect(PROT_READ) * mprotect(PROT_EXEC) * * But allows the following: * * d) mmap(PROT_READ | PROT_EXEC) * mmap(PROT_READ | PROT_EXEC | PROT_BTI) */ static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) return false; if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) return true; if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) return true; return false; } #endif /* _LINUX_MMAN_H */
3 3 3 3 3 3 3 3 3 451 449 447 450 310 40 429 426 393 396 394 3 3 3 418 470 470 470 52 1 450 451 450 451 448 451 405 475 470 1 475 7 471 471 472 475 476 476 476 476 406 416 28 27 28 26 2 1 27 26 27 472 475 324 8 2 490 489 29 28 474 430 429 337 226 121 489 489 492 488 488 42 2 491 83 32 475 475 448 439 433 407 53 53 53 41 40 41 41 40 39 39 39 32 10 10 10 10 1 1 24 24 21 3 3 4 10 3 3 2 10 24 41 29 26 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/swapops.h> /* * We want to know the real level where a entry is located ignoring any * folding of levels which may be happening. For example if p4d is folded then * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). */ static int real_depth(int depth) { if (depth == 3 && PTRS_PER_PMD == 1) depth = 2; if (depth == 2 && PTRS_PER_PUD == 1) depth = 1; if (depth == 1 && PTRS_PER_P4D == 1) depth = 0; return depth; } static int walk_pte_range_inner(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { const struct mm_walk_ops *ops = walk->ops; int err = 0; for (;;) { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; if (addr >= end - PAGE_SIZE) break; addr += PAGE_SIZE; pte++; } return err; } static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t *pte; int err = 0; spinlock_t *ptl; if (walk->no_vma) { /* * pte_offset_map() might apply user-specific validation. * Indeed, on x86_64 the pmd entries set up by init_espfix_ap() * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear), * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them. */ if (walk->mm == &init_mm || addr >= TASK_SIZE) pte = pte_offset_kernel(pmd, addr); else pte = pte_offset_map(pmd, addr); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); if (walk->mm != &init_mm && addr < TASK_SIZE) pte_unmap(pte); } } else { pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); pte_unmap_unlock(pte, ptl); } } if (!pte) walk->action = ACTION_AGAIN; return err; } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; int depth = real_depth(3); pmd = pmd_offset(pud, addr); do { again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; continue; } walk->action = ACTION_SUBTREE; /* * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ if (ops->pmd_entry) err = ops->pmd_entry(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; /* * Check this here so we only break down trans_huge * pages when we _need_ to */ if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) || walk->action == ACTION_CONTINUE || !(ops->pte_entry)) continue; if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); err = walk_pte_range(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; } while (pmd++, addr = next, addr != end); return err; } static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; int depth = real_depth(2); pud = pud_offset(p4d, addr); do { again: next = pud_addr_end(addr, end); if (pud_none(*pud)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; continue; } walk->action = ACTION_SUBTREE; if (ops->pud_entry) err = ops->pud_entry(pud, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) || walk->action == ACTION_CONTINUE || !(ops->pmd_entry || ops->pte_entry)) continue; if (walk->vma) split_huge_pud(walk->vma, pud, addr); if (pud_none(*pud)) goto again; err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, struct mm_walk *walk) { p4d_t *p4d; unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; int depth = real_depth(1); p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; continue; } if (ops->p4d_entry) { err = ops->p4d_entry(p4d, addr, next, walk); if (err) break; } if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) err = walk_pud_range(p4d, addr, next, walk); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int walk_pgd_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { pgd_t *pgd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; if (walk->pgd) pgd = walk->pgd + pgd_index(addr); else pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, 0, walk); if (err) break; continue; } if (ops->pgd_entry) { err = ops->pgd_entry(pgd, addr, next, walk); if (err) break; } if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); return err; } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) { unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); return boundary < end ? boundary : end; } static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); pte_t *pte; const struct mm_walk_ops *ops = walk->ops; int err = 0; hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); if (err) break; } while (addr = next, addr != end); hugetlb_vma_unlock_read(vma); return err; } #else /* CONFIG_HUGETLB_PAGE */ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ /* * Decide whether we really walk over the current vma on [@start, @end) * or skip it via the returned value. Return 0 if we do walk over the * current vma, and return 1 if we skip the vma. Negative values means * error, where we abort the current walk. */ static int walk_page_test(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; if (ops->test_walk) return ops->test_walk(start, end, walk); /* * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP * range, so we don't walk over it as we do for normal vmas. However, * Some callers are interested in handling hole range and they don't * want to just ignore any single address range. Such users certainly * define their ->pte_hole() callbacks, so let's delegate them to handle * vma(VM_PFNMAP). */ if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) err = ops->pte_hole(start, end, -1, walk); return err ? err : 1; } return 0; } static int __walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { int err = 0; struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); if (err) return err; } if (is_vm_hugetlb_page(vma)) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); if (ops->post_vma) ops->post_vma(walk); return err; } static inline void process_mm_walk_lock(struct mm_struct *mm, enum page_walk_lock walk_lock) { if (walk_lock == PGWALK_RDLOCK) mmap_assert_locked(mm); else mmap_assert_write_locked(mm); } static inline void process_vma_walk_lock(struct vm_area_struct *vma, enum page_walk_lock walk_lock) { #ifdef CONFIG_PER_VMA_LOCK switch (walk_lock) { case PGWALK_WRLOCK: vma_start_write(vma); break; case PGWALK_WRLOCK_VERIFY: vma_assert_write_locked(vma); break; case PGWALK_RDLOCK: /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ break; } #endif } /** * walk_page_range - walk page table with caller specific callbacks * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @private: private data for callbacks' usage * * Recursively walk the page table tree of the process represented by @mm * within the virtual address range [@start, @end). During walking, we can do * some caller-specific works for each entry, by setting up pmd_entry(), * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these * callbacks, the associated entries/pages are just ignored. * The return values of these callbacks are commonly defined like below: * * - 0 : succeeded to handle the current entry, and if you don't reach the * end address yet, continue to walk. * - >0 : succeeded to handle the current entry, and return to the caller * with caller specific value. * - <0 : failed to handle the current entry, and return to the caller * with error code. * * Before starting to walk page table, some callers want to check whether * they really want to walk over the current vma, typically by checking * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * * If operations need to be staged before and committed after a vma is walked, * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), * since it is intended to handle commit-type operations, can't return any * errors. * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some * caller-specific data to callbacks, @private should be helpful. * * Locking: * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, * because these function traverse vma list and/or access to vma's data. */ int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { int err = 0; unsigned long next; struct vm_area_struct *vma; struct mm_walk walk = { .ops = ops, .mm = mm, .private = private, }; if (start >= end) return -EINVAL; if (!walk.mm) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); vma = find_vma(walk.mm, start); do { if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ process_vma_walk_lock(vma, ops->walk_lock); walk.vma = vma; next = min(end, vma->vm_end); vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { /* * positive return values are purely for * controlling the pagewalk, so should never * be passed to the callers. */ err = 0; continue; } if (err < 0) break; err = __walk_page_range(start, next, &walk); } if (err) break; } while (start = next, start < end); return err; } /** * walk_page_range_novma - walk a range of pagetables not backed by a vma * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @pgd: pgd to walk if different from mm->pgd * @private: private data for callbacks' usage * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for * walking the kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approache (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = mm, .pgd = pgd, .private = private, .no_vma = true }; if (start >= end || !walk.mm) return -EINVAL; /* * 1) For walking the user virtual address space: * * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should * be hold. * * 2) For walking the kernel virtual address space: * * The kernel intermediate page tables usually do not be freed, so * the mmap map read lock is sufficient. But there are some exceptions. * E.g. memory hot-remove. In which case, the mmap lock is insufficient * to prevent the intermediate kernel pages tables belonging to the * specified address range from being freed. The caller should take * other actions to prevent this race. */ if (mm == &init_mm) mmap_assert_locked(walk.mm); else mmap_assert_write_locked(walk.mm); return walk_pgd_range(start, end, &walk); } int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (start >= end || !walk.mm) return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (!walk.mm) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } /** * walk_page_mapping - walk all memory areas mapped into a struct address_space. * @mapping: Pointer to the struct address_space * @first_index: First page offset in the address_space * @nr: Number of incremental page offsets to cover * @ops: operation to call during the walk * @private: private data for callbacks' usage * * This function walks all memory areas mapped into a struct address_space. * The walk is limited to only the given page-size index range, but if * the index boundaries cross a huge page-table entry, that entry will be * included. * * Also see walk_page_range() for additional information. * * Locking: * This function can't require that the struct mm_struct::mmap_lock is held, * since @mapping may be mapped by multiple processes. Instead * @mapping->i_mmap_rwsem must be held. This might have implications in the * callbacks, and it's up tho the caller to ensure that the * struct mm_struct::mmap_lock is not needed. * * Also this means that a caller can't rely on the struct * vm_area_struct::vm_flags to be constant across a call, * except for immutable flags. Callers requiring this shouldn't use * this function. * * Return: 0 on success, negative error code on failure, positive number on * caller defined premature termination. */ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .private = private, }; struct vm_area_struct *vma; pgoff_t vba, vea, cba, cea; unsigned long start_addr, end_addr; int err = 0; lockdep_assert_held(&mapping->i_mmap_rwsem); vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, first_index + nr - 1) { /* Clip to the vma */ vba = vma->vm_pgoff; vea = vba + vma_pages(vma); cba = first_index; cba = max(cba, vba); cea = first_index + nr; cea = min(cea, vea); start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; if (start_addr >= end_addr) continue; walk.vma = vma; walk.mm = vma->vm_mm; err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) { err = 0; break; } else if (err < 0) break; err = __walk_page_range(start_addr, end_addr, &walk); if (err) break; } return err; } /** * folio_walk_start - walk the page tables to a folio * @fw: filled with information on success. * @vma: the VMA. * @addr: the virtual address to use for the page table walk. * @flags: flags modifying which folios to walk to. * * Walk the page tables using @addr in a given @vma to a mapped folio and * return the folio, making sure that the page table entry referenced by * @addr cannot change until folio_walk_end() was called. * * As default, this function returns only folios that are not special (e.g., not * the zeropage) and never returns folios that are supposed to be ignored by the * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * * As default, this function only considers present page table entries. * If requested, it will also consider migration entries. * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * * On success, @fw is filled and the function returns the folio while the PTL * is still held and folio_walk_end() must be called to clean up, * releasing any held locks. The returned folio must *not* be used after the * call to folio_walk_end(), unless a short-term folio reference is taken before * that call. * * @fw->page will correspond to the page that is effectively referenced by * @addr. However, for migration entries and shared zeropages @fw->page is * set to NULL. Note that large folios might be mapped by multiple page table * entries, and this function will always only lookup a single entry as * specified by @addr, which might or might not cover more than a single page of * the returned folio. * * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or * to carelessly modify page content. This function may *only* be used to grab * short-term folio references, never to grab long-term folio references. * * Using the page table entry pointers in @fw for reading or modifying the * entry should be avoided where possible: however, there might be valid * use cases. * * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. * For example, PMD page table sharing might require prior unsharing. Also, * logical hugetlb entries might span multiple physical page table entries, * which *must* be modified in a single operation (set_huge_pte_at(), * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might * not correspond to the first physical entry of a logical hugetlb entry. * * The mmap lock must be held in read mode. * * Return: folio pointer on success, otherwise NULL. */ struct folio *folio_walk_start(struct folio_walk *fw, struct vm_area_struct *vma, unsigned long addr, folio_walk_flags_t flags) { unsigned long entry_size; bool expose_page = true; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; spinlock_t *ptl; pgd_t *pgdp; p4d_t *p4dp; mmap_assert_locked(vma->vm_mm); vma_pgtable_walk_begin(vma); if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) goto not_found; pgdp = pgd_offset(vma->vm_mm, addr); if (pgd_none_or_clear_bad(pgdp)) goto not_found; p4dp = p4d_offset(pgdp, addr); if (p4d_none_or_clear_bad(p4dp)) goto not_found; pudp = pud_offset(p4dp, addr); pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); entry_size = PUD_SIZE; fw->level = FW_LEVEL_PUD; fw->pudp = pudp; fw->pud = pud; if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; } else if (!pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; } /* * TODO: vm_normal_page_pud() will be handy once we want to * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. */ page = pud_page(pud); goto found; } pmd_table: VM_WARN_ON_ONCE(pud_leaf(*pudp)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); entry_size = PMD_SIZE; fw->level = FW_LEVEL_PMD; fw->pmdp = pmdp; fw->pmd = pmd; if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; } else if (!pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { page = vm_normal_page_pmd(vma, addr, pmd); if (page) { goto found; } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); expose_page = false; goto found; } } else if ((flags & FW_MIGRATION) && is_pmd_migration_entry(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); page = pfn_swap_entry_to_page(entry); expose_page = false; goto found; } spin_unlock(ptl); goto not_found; } pte_table: VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; pte = ptep_get(ptep); entry_size = PAGE_SIZE; fw->level = FW_LEVEL_PTE; fw->ptep = ptep; fw->pte = pte; if (pte_present(pte)) { page = vm_normal_page(vma, addr, pte); if (page) goto found; if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); expose_page = false; goto found; } } else if (!pte_none(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); if ((flags & FW_MIGRATION) && is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); expose_page = false; goto found; } } pte_unmap_unlock(ptep, ptl); not_found: vma_pgtable_walk_end(vma); return NULL; found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. */ fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; return page_folio(page); }
93 93 94 153 155 151 154 154 155 155 51 127 2 128 1 125 127 1 124 2 152 2 150 128 128 128 128 128 154 152 4 4 150 148 154 51 51 51 51 51 51 51 53 53 53 53 2 1 51 51 51 52 1 1 1 1 1 1 1 1 1 52 55 3 53 53 53 1 52 1 51 51 51 51 51 51 1 52 53 56 52 53 53 5 51 153 153 1 1 152 153 2 151 91 74 56 56 1 51 150 153 2 152 166 166 133 4 4 94 122 4 4 4 4 4 4 4 122 122 121 5 1 4 4 4 4 5 4 5 122 94 94 82 93 94 92 91 2 2 2 2 94 99 76 1 97 151 151 146 139 8 3 67 5 5 5 63 19 8 5 8 150 146 136 138 77 39 66 89 90 90 90 139 138 138 138 137 88 136 138 137 137 138 137 139 137 87 137 137 137 137 135 137 136 136 136 108 137 133 132 132 132 89 90 48 89 90 77 89 1 88 1 90 111 109 111 108 108 74 73 111 137 137 136 137 87 108 102 29 1 28 28 28 28 28 28 28 28 133 133 5 129 129 15 1 1 1 130 14 127 127 127 127 127 127 126 127 127 127 127 130 127 126 19 133 144 144 144 144 144 2 143 29 1 133 133 127 87 94 19 6 14 13 14 14 5 13 28 69 73 8 142 42 41 42 42 1 41 42 27 27 77 5 5 91 91 91 91 91 91 5 86 86 88 87 2 11 11 10 11 3 3 3 1 105 103 105 83 83 82 94 94 87 87 87 86 16 16 4 4 104 75 15 15 15 15 15 15 15 3 15 15 15 15 3 15 15 15 15 15 15 15 3 15 15 15 15 15 12 11 11 11 3 3 3 3 3 3 3 1 64 64 63 64 64 21 2 64 58 3 2 56 5 1 63 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 // SPDX-License-Identifier: GPL-2.0-or-later /* * suballoc.c * * metadata alloc and free * Inspired by ext3 block groups. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "blockcheck.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" #include "localalloc.h" #include "suballoc.h" #include "super.h" #include "sysfile.h" #include "uptodate.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" #define NOT_ALLOC_NEW_GROUP 0 #define ALLOC_NEW_GROUP 0x1 #define ALLOC_GROUPS_FROM_GLOBAL 0x2 #define OCFS2_MAX_TO_STEAL 1024 struct ocfs2_suballoc_result { u64 sr_bg_blkno; /* The bg we allocated from. Set to 0 when a block group is contiguous. */ u64 sr_bg_stable_blkno; /* * Doesn't change, always * set to target block * group descriptor * block. */ u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ unsigned int sr_max_contig_bits; /* The length for contiguous * free bits, only available * for cluster group */ }; static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) { if (res->sr_blkno == 0) return 0; if (res->sr_bg_blkno) return res->sr_bg_blkno; return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); } static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, struct buffer_head *bh, u64 max_block, u64 *last_alloc_group, int flags); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, struct ocfs2_suballoc_result *res); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, struct ocfs2_suballoc_result *res); static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, struct buffer_head *bg_bh, struct buffer_head *prev_bg_bh, u16 chain); static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, u32 wanted); static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, u64 bg_blkno, u16 bg_bit_off); static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off); static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, u32 bits_wanted, u64 max_block, int flags, struct ocfs2_alloc_context **ac); void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; if (inode) { if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); inode_unlock(inode); iput(inode); ac->ac_inode = NULL; } brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; kfree(ac->ac_find_loc_priv); ac->ac_find_loc_priv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) { ocfs2_free_ac_resource(ac); kfree(ac); } static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) { return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); } #define do_error(fmt, ...) \ do { \ if (resize) \ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ else \ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ } while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, int resize) { struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); } return 0; } static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_dinode *di, struct buffer_head *bh, int resize) { unsigned int max_bits; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ if ((le16_to_cpu(gd->bg_chain) > le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); } return 0; } #undef do_error /* * This version only prints errors. It does not fail the filesystem, and * exists only for resize. */ int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, struct buffer_head *bh) { int rc; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; BUG_ON(!buffer_uptodate(bh)); /* * If the ecc fails, we return the error but otherwise * leave the filesystem running. We know any error is * local to this block. */ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); if (rc) { mlog(ML_ERROR, "Checksum failed for group descriptor %llu\n", (unsigned long long)bh->b_blocknr); } else rc = ocfs2_validate_gd_self(sb, bh, 1); if (!rc) rc = ocfs2_validate_gd_parent(sb, di, bh, 1); return rc; } static int ocfs2_validate_group_descriptor(struct super_block *sb, struct buffer_head *bh) { int rc; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; trace_ocfs2_validate_group_descriptor( (unsigned long long)bh->b_blocknr); BUG_ON(!buffer_uptodate(bh)); /* * If the ecc fails, we return the error but otherwise * leave the filesystem running. We know any error is * local to this block. */ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); if (rc) return rc; /* * Errors after here are fatal. */ return ocfs2_validate_gd_self(sb, bh, 0); } int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, u64 gd_blkno, struct buffer_head **bh) { int rc; struct buffer_head *tmp = *bh; rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, ocfs2_validate_group_descriptor); if (rc) goto out; rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); if (rc) { brelse(tmp); goto out; } /* If ocfs2_read_block() got us a new bh, pass it up. */ if (!*bh) *bh = tmp; out: return rc; } static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, struct ocfs2_group_desc *bg, struct ocfs2_chain_list *cl, u64 p_blkno, unsigned int clusters) { struct ocfs2_extent_list *el = &bg->bg_list; struct ocfs2_extent_rec *rec; BUG_ON(!ocfs2_supports_discontig_bg(osb)); if (!el->l_next_free_rec) el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; rec->e_blkno = cpu_to_le64(p_blkno); rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc)); rec->e_leaf_clusters = cpu_to_le16(clusters); le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); le16_add_cpu(&bg->bg_free_bits_count, clusters * le16_to_cpu(cl->cl_bpc)); le16_add_cpu(&el->l_next_free_rec, 1); } static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl) { int status = 0; struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { status = ocfs2_error(alloc_inode->i_sb, "group block (%llu) != b_blocknr (%llu)\n", (unsigned long long)group_blkno, (unsigned long long) bg_bh->b_blocknr); goto bail; } status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); bg->bg_generation = cpu_to_le32(osb->fs_generation); bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); bg->bg_blkno = cpu_to_le64(group_blkno); if (group_clusters == le16_to_cpu(cl->cl_cpg)) bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); else ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, group_clusters); /* set the 1st bit in the bitmap to account for the descriptor block */ ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); ocfs2_journal_dirty(handle, bg_bh); /* There is no need to zero out or otherwise initialize the * other blocks in a group - All valid FS metadata in a block * group stores the superblock fs_generation value at * allocation time. */ bail: if (status) mlog_errno(status); return status; } static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) { u16 curr, best; best = curr = 0; while (curr < le16_to_cpu(cl->cl_count)) { if (le32_to_cpu(cl->cl_recs[best].c_total) > le32_to_cpu(cl->cl_recs[curr].c_total)) best = curr; curr++; } return best; } static struct buffer_head * ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, struct inode *alloc_inode, struct ocfs2_alloc_context *ac, struct ocfs2_chain_list *cl) { int status; u32 bit_off, num_bits; u64 bg_blkno; struct buffer_head *bg_bh; unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); status = ocfs2_claim_clusters(handle, ac, le16_to_cpu(cl->cl_cpg), &bit_off, &num_bits); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } /* setup the group */ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); trace_ocfs2_block_group_alloc_contig( (unsigned long long)bg_blkno, alloc_rec); bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { status = -ENOMEM; mlog_errno(status); goto bail; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, bg_blkno, num_bits, alloc_rec, cl); if (status < 0) { brelse(bg_bh); mlog_errno(status); } bail: return status ? ERR_PTR(status) : bg_bh; } static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, handle_t *handle, struct ocfs2_alloc_context *ac, unsigned int min_bits, u32 *bit_off, u32 *num_bits) { int status = 0; while (min_bits) { status = ocfs2_claim_clusters(handle, ac, min_bits, bit_off, num_bits); if (status != -ENOSPC) break; min_bits >>= 1; } return status; } static int ocfs2_block_group_grow_discontig(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, struct ocfs2_alloc_context *ac, struct ocfs2_chain_list *cl, unsigned int min_bits) { int status; struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *)bg_bh->b_data; unsigned int needed = le16_to_cpu(cl->cl_cpg) - le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); u32 p_cpos, clusters; u64 p_blkno; struct ocfs2_extent_list *el = &bg->bg_list; status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))) { if (min_bits > needed) min_bits = needed; status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, &p_cpos, &clusters); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, clusters); min_bits = clusters; needed = le16_to_cpu(cl->cl_cpg) - le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); } if (needed > 0) { /* * We have used up all the extent rec but can't fill up * the cpg. So bail out. */ status = -ENOSPC; goto bail; } ocfs2_journal_dirty(handle, bg_bh); bail: return status; } static void ocfs2_bg_alloc_cleanup(handle_t *handle, struct ocfs2_alloc_context *cluster_ac, struct inode *alloc_inode, struct buffer_head *bg_bh) { int i, ret; struct ocfs2_group_desc *bg; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; if (!bg_bh) return; bg = (struct ocfs2_group_desc *)bg_bh->b_data; el = &bg->bg_list; for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, cluster_ac->ac_bh, le64_to_cpu(rec->e_blkno), le16_to_cpu(rec->e_leaf_clusters)); if (ret) mlog_errno(ret); /* Try all the clusters to free */ } ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); brelse(bg_bh); } static struct buffer_head * ocfs2_block_group_alloc_discontig(handle_t *handle, struct inode *alloc_inode, struct ocfs2_alloc_context *ac, struct ocfs2_chain_list *cl) { int status; u32 bit_off, num_bits; u64 bg_blkno; unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; struct buffer_head *bg_bh = NULL; unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); if (!ocfs2_supports_discontig_bg(osb)) { status = -ENOSPC; goto bail; } status = ocfs2_extend_trans(handle, ocfs2_calc_bg_discontig_credits(osb->sb)); if (status) { mlog_errno(status); goto bail; } /* * We're going to be grabbing from multiple cluster groups. * We don't have enough credits to relink them all, and the * cluster groups will be staying in cache for the duration of * this operation. */ ac->ac_disable_chain_relink = 1; /* Claim the first region */ status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, &bit_off, &num_bits); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } min_bits = num_bits; /* setup the group */ bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); trace_ocfs2_block_group_alloc_discontig( (unsigned long long)bg_blkno, alloc_rec); bg_bh = sb_getblk(osb->sb, bg_blkno); if (!bg_bh) { status = -ENOMEM; mlog_errno(status); goto bail; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, bg_blkno, num_bits, alloc_rec, cl); if (status < 0) { mlog_errno(status); goto bail; } status = ocfs2_block_group_grow_discontig(handle, alloc_inode, bg_bh, ac, cl, min_bits); if (status) mlog_errno(status); bail: if (status) ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); return status ? ERR_PTR(status) : bg_bh; } /* * We expect the block group allocator to already be locked. */ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, struct buffer_head *bh, u64 max_block, u64 *last_alloc_group, int flags) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; struct ocfs2_chain_list *cl; struct ocfs2_alloc_context *ac = NULL; handle_t *handle = NULL; u16 alloc_rec; struct buffer_head *bg_bh = NULL; struct ocfs2_group_desc *bg; BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); cl = &fe->id2.i_chain; status = ocfs2_reserve_clusters_with_limit(osb, le16_to_cpu(cl->cl_cpg), max_block, flags, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } credits = ocfs2_calc_group_alloc_credits(osb->sb, le16_to_cpu(cl->cl_cpg)); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; mlog_errno(status); goto bail; } if (last_alloc_group && *last_alloc_group != 0) { trace_ocfs2_block_group_alloc( (unsigned long long)*last_alloc_group); ac->ac_last_group = *last_alloc_group; } bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); if (PTR_ERR(bg_bh) == -ENOSPC) bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); if (IS_ERR(bg_bh)) { status = PTR_ERR(bg_bh); bg_bh = NULL; if (status != -ENOSPC) mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) bg_bh->b_data; status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } alloc_rec = le16_to_cpu(bg->bg_chain); le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(alloc_inode)->ip_lock); OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, le32_to_cpu(fe->i_clusters))); spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; /* save the new last alloc group so that the caller can cache it. */ if (last_alloc_group) *last_alloc_group = ac->ac_last_group; bail: if (handle) ocfs2_commit_trans(osb, handle); if (ac) ocfs2_free_alloc_context(ac); brelse(bg_bh); if (status) mlog_errno(status); return status; } static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, int type, u32 slot, u64 *last_alloc_group, int flags) { int status; u32 bits_wanted = ac->ac_bits_wanted; struct inode *alloc_inode; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe; u32 free_bits; alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); if (!alloc_inode) { mlog_errno(-EINVAL); return -EINVAL; } inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); return status; } ac->ac_inode = alloc_inode; ac->ac_alloc_slot = slot; fe = (struct ocfs2_dinode *) bh->b_data; /* The bh was validated by the inode read inside * ocfs2_inode_lock(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { status = ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu\n", (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - le32_to_cpu(fe->id1.bitmap1.i_used); if (bits_wanted > free_bits) { /* cluster bitmap never grows */ if (ocfs2_is_cluster_bitmap(alloc_inode)) { trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, free_bits); status = -ENOSPC; goto bail; } if (!(flags & ALLOC_NEW_GROUP)) { trace_ocfs2_reserve_suballoc_bits_no_new_group( slot, bits_wanted, free_bits); status = -ENOSPC; goto bail; } status = ocfs2_block_group_alloc(osb, alloc_inode, bh, ac->ac_max_block, last_alloc_group, flags); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } atomic_inc(&osb->alloc_stats.bg_extends); /* You should never ask for this much metadata */ BUG_ON(bits_wanted > (le32_to_cpu(fe->id1.bitmap1.i_total) - le32_to_cpu(fe->id1.bitmap1.i_used))); } get_bh(bh); ac->ac_bh = bh; bail: brelse(bh); if (status) mlog_errno(status); return status; } static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) { spin_lock(&osb->osb_lock); osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; spin_unlock(&osb->osb_lock); atomic_set(&osb->s_num_inodes_stolen, 0); } static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) { spin_lock(&osb->osb_lock); osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; spin_unlock(&osb->osb_lock); atomic_set(&osb->s_num_meta_stolen, 0); } void ocfs2_init_steal_slots(struct ocfs2_super *osb) { ocfs2_init_inode_steal_slot(osb); ocfs2_init_meta_steal_slot(osb); } static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) { spin_lock(&osb->osb_lock); if (type == INODE_ALLOC_SYSTEM_INODE) osb->s_inode_steal_slot = (u16)slot; else if (type == EXTENT_ALLOC_SYSTEM_INODE) osb->s_meta_steal_slot = (u16)slot; spin_unlock(&osb->osb_lock); } static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) { int slot = OCFS2_INVALID_SLOT; spin_lock(&osb->osb_lock); if (type == INODE_ALLOC_SYSTEM_INODE) slot = osb->s_inode_steal_slot; else if (type == EXTENT_ALLOC_SYSTEM_INODE) slot = osb->s_meta_steal_slot; spin_unlock(&osb->osb_lock); return slot; } static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) { return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); } static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) { return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); } static int ocfs2_steal_resource(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, int type) { int i, status = -ENOSPC; int slot = __ocfs2_get_steal_slot(osb, type); /* Start to steal resource from the first slot after ours. */ if (slot == OCFS2_INVALID_SLOT) slot = osb->slot_num + 1; for (i = 0; i < osb->max_slots; i++, slot++) { if (slot == osb->max_slots) slot = 0; if (slot == osb->slot_num) continue; status = ocfs2_reserve_suballoc_bits(osb, ac, type, (u32)slot, NULL, NOT_ALLOC_NEW_GROUP); if (status >= 0) { __ocfs2_set_steal_slot(osb, slot, type); break; } ocfs2_free_ac_resource(ac); } return status; } static int ocfs2_steal_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); } static int ocfs2_steal_meta(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); } int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, int blocks, struct ocfs2_alloc_context **ac) { int status; int slot = ocfs2_get_meta_steal_slot(osb); *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); goto bail; } (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; (*ac)->ac_group_search = ocfs2_block_group_search; if (slot != OCFS2_INVALID_SLOT && atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) goto extent_steal; atomic_set(&osb->s_num_meta_stolen, 0); status = ocfs2_reserve_suballoc_bits(osb, (*ac), EXTENT_ALLOC_SYSTEM_INODE, (u32)osb->slot_num, NULL, ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); if (status >= 0) { status = 0; if (slot != OCFS2_INVALID_SLOT) ocfs2_init_meta_steal_slot(osb); goto bail; } else if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; } ocfs2_free_ac_resource(*ac); extent_steal: status = ocfs2_steal_meta(osb, *ac); atomic_inc(&osb->s_num_meta_stolen); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } status = 0; bail: if ((status < 0) && *ac) { ocfs2_free_alloc_context(*ac); *ac = NULL; } if (status) mlog_errno(status); return status; } int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, struct ocfs2_extent_list *root_el, struct ocfs2_alloc_context **ac) { return ocfs2_reserve_new_metadata_blocks(osb, ocfs2_extend_meta_needed(root_el), ac); } int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac) { int status; int slot = ocfs2_get_inode_steal_slot(osb); u64 alloc_group; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); goto bail; } (*ac)->ac_bits_wanted = 1; (*ac)->ac_which = OCFS2_AC_USE_INODE; (*ac)->ac_group_search = ocfs2_block_group_search; /* * stat(2) can't handle i_ino > 32bits, so we tell the * lower levels not to allocate us a block group past that * limit. The 'inode64' mount option avoids this behavior. */ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) (*ac)->ac_max_block = (u32)~0U; /* * slot is set when we successfully steal inode from other nodes. * It is reset in 3 places: * 1. when we flush the truncate log * 2. when we complete local alloc recovery. * 3. when we successfully allocate from our own slot. * After it is set, we will go on stealing inodes until we find the * need to check our slots to see whether there is some space for us. */ if (slot != OCFS2_INVALID_SLOT && atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) goto inode_steal; atomic_set(&osb->s_num_inodes_stolen, 0); alloc_group = osb->osb_inode_alloc_group; status = ocfs2_reserve_suballoc_bits(osb, *ac, INODE_ALLOC_SYSTEM_INODE, (u32)osb->slot_num, &alloc_group, ALLOC_NEW_GROUP | ALLOC_GROUPS_FROM_GLOBAL); if (status >= 0) { status = 0; spin_lock(&osb->osb_lock); osb->osb_inode_alloc_group = alloc_group; spin_unlock(&osb->osb_lock); trace_ocfs2_reserve_new_inode_new_group( (unsigned long long)alloc_group); /* * Some inodes must be freed by us, so try to allocate * from our own next time. */ if (slot != OCFS2_INVALID_SLOT) ocfs2_init_inode_steal_slot(osb); goto bail; } else if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; } ocfs2_free_ac_resource(*ac); inode_steal: status = ocfs2_steal_inode(osb, *ac); atomic_inc(&osb->s_num_inodes_stolen); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } status = 0; bail: if ((status < 0) && *ac) { ocfs2_free_alloc_context(*ac); *ac = NULL; } if (status) mlog_errno(status); return status; } /* local alloc code has to do the same thing, so rather than do this * twice.. */ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { int status; ac->ac_which = OCFS2_AC_USE_MAIN; ac->ac_group_search = ocfs2_cluster_group_search; status = ocfs2_reserve_suballoc_bits(osb, ac, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); if (status < 0 && status != -ENOSPC) mlog_errno(status); return status; } /* Callers don't need to care which bitmap (local alloc or main) to * use so we figure it out for them, but unfortunately this clutters * things a bit. */ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, u32 bits_wanted, u64 max_block, int flags, struct ocfs2_alloc_context **ac) { int status, ret = 0; int retried = 0; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; mlog_errno(status); goto bail; } (*ac)->ac_bits_wanted = bits_wanted; (*ac)->ac_max_block = max_block; status = -ENOSPC; if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; } } if (status == -ENOSPC) { retry: status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); /* Retry if there is sufficient space cached in truncate log */ if (status == -ENOSPC && !retried) { retried = 1; ocfs2_inode_unlock((*ac)->ac_inode, 1); inode_unlock((*ac)->ac_inode); ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); if (ret == 1) { iput((*ac)->ac_inode); (*ac)->ac_inode = NULL; goto retry; } if (ret < 0) mlog_errno(ret); inode_lock((*ac)->ac_inode); ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); if (ret < 0) { mlog_errno(ret); inode_unlock((*ac)->ac_inode); iput((*ac)->ac_inode); (*ac)->ac_inode = NULL; goto bail; } } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } } status = 0; bail: if ((status < 0) && *ac) { ocfs2_free_alloc_context(*ac); *ac = NULL; } if (status) mlog_errno(status); return status; } int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac) { return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ALLOC_NEW_GROUP, ac); } /* * More or less lifted from ext3. I'll leave their description below: * * "For ext3 allocations, we must not reuse any blocks which are * allocated in the bitmap buffer's "last committed data" copy. This * prevents deletes from freeing up the page for reuse until we have * committed the delete transaction. * * If we didn't do this, then deleting something and reallocating it as * data would allow the old block to be overwritten before the * transaction committed (because we force data to disk before commit). * This would lead to corruption if we crashed between overwriting the * data and committing the delete. * * @@@ We may want to make this allocation behaviour conditional on * data-writes at some point, and disable it for metadata allocations or * sync-data inodes." * * Note: OCFS2 already does this differently for metadata vs data * allocations, as those bitmaps are separate and undo access is never * called on a metadata group descriptor. */ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; jh = jbd2_journal_grab_journal_head(bg_bh); if (!jh) return 1; spin_lock(&jh->b_state_lock); bg = (struct ocfs2_group_desc *) jh->b_committed_data; if (bg) ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); else ret = 1; spin_unlock(&jh->b_state_lock); jbd2_journal_put_journal_head(jh); return ret; } u16 ocfs2_find_max_contig_free_bits(void *bitmap, u16 total_bits, u16 start) { u16 offset, free_bits; u16 contig_bits = 0; while (start < total_bits) { offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); if (offset == total_bits) break; start = ocfs2_find_next_bit(bitmap, total_bits, offset); free_bits = start - offset; if (contig_bits < free_bits) contig_bits = free_bits; } return contig_bits; } static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, unsigned int total_bits, struct ocfs2_suballoc_result *res) { void *bitmap; u16 best_offset, best_size; u16 prev_best_size = 0; int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; /* Callers got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) < total_bits) { if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { /* We found a zero, but we can't use it as it * hasn't been put to disk yet! */ found = 0; start = offset + 1; } else if (offset == start) { /* we found a zero */ found++; /* move start to the next bit to test */ start++; } else { /* got a zero after some ones */ found = 1; start = offset + 1; prev_best_size = best_size; } if (found > best_size) { best_size = found; best_offset = start - found; } /* we got everything we needed */ if (found == bits_wanted) { /* mlog(0, "Found it all!\n"); */ break; } } /* best_size will be allocated, we save prev_best_size */ res->sr_max_contig_bits = prev_best_size; if (best_size) { res->sr_bit_offset = best_offset; res->sr_bits = best_size; } else { status = -ENOSPC; /* No error log here -- see the comment above * ocfs2_test_bg_bit_allocatable */ } return status; } int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits, unsigned int max_contig_bits, int fastpath) { int status; void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; unsigned int start = bit_off + num_bits; u16 contig_bits; struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); trace_ocfs2_block_group_set_bits(bit_off, num_bits); if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), group_bh, journal_type); if (status < 0) { mlog_errno(status); goto bail; } le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), le16_to_cpu(bg->bg_free_bits_count), num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); /* * this is optimize path, caller set old contig value * in max_contig_bits to bypass finding action. */ if (fastpath) { bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); } else if (ocfs2_is_cluster_bitmap(alloc_inode)) { /* * Usually, the block group bitmap allocates only 1 bit * at a time, while the cluster group allocates n bits * each time. Therefore, we only save the contig bits for * the cluster group. */ contig_bits = ocfs2_find_max_contig_free_bits(bitmap, le16_to_cpu(bg->bg_bits), start); if (contig_bits > max_contig_bits) max_contig_bits = contig_bits; bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits); } else { bg->bg_contig_free_bits = 0; } ocfs2_journal_dirty(handle, group_bh); bail: return status; } /* find the one with the most empty bits */ static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) { u16 curr, best; BUG_ON(!cl->cl_next_free_rec); best = curr = 0; while (curr < le16_to_cpu(cl->cl_next_free_rec)) { if (le32_to_cpu(cl->cl_recs[curr].c_free) > le32_to_cpu(cl->cl_recs[best].c_free)) best = curr; curr++; } BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); return best; } static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, struct buffer_head *bg_bh, struct buffer_head *prev_bg_bh, u16 chain) { int status; /* there is a really tiny chance the journal calls could fail, * but we wouldn't want inconsistent blocks in *any* case. */ u64 bg_ptr, prev_bg_ptr; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; /* The caller got these descriptors from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); trace_ocfs2_relink_block_group( (unsigned long long)le64_to_cpu(fe->i_blkno), chain, (unsigned long long)le64_to_cpu(bg->bg_blkno), (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), prev_bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) goto out; prev_bg->bg_next_group = bg->bg_next_group; ocfs2_journal_dirty(handle, prev_bg_bh); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) goto out_rollback_prev_bg; bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; ocfs2_journal_dirty(handle, bg_bh); status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) goto out_rollback_bg; fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; ocfs2_journal_dirty(handle, fe_bh); out: if (status < 0) mlog_errno(status); return status; out_rollback_bg: bg->bg_next_group = cpu_to_le64(bg_ptr); out_rollback_prev_bg: prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); goto out; } static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, u32 wanted) { return le16_to_cpu(bg->bg_free_bits_count) > wanted; } /* return 0 on success, -ENOSPC to keep searching and any other < 0 * value on error. */ static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, struct ocfs2_suballoc_result *res) { int search = -ENOSPC; int ret; u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int max_bits, gd_cluster_off; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); if (le16_to_cpu(gd->bg_contig_free_bits) && le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted) return -ENOSPC; /* ->bg_contig_free_bits may un-initialized, so compare again */ if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) { max_bits = le16_to_cpu(gd->bg_bits); /* Tail groups in cluster bitmaps which aren't cpg * aligned are prone to partial extension by a failed * fs resize. If the file system resize never got to * update the dinode cluster count, then we don't want * to trust any clusters past it, regardless of what * the group descriptor says. */ gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, le64_to_cpu(gd->bg_blkno)); if ((gd_cluster_off + max_bits) > OCFS2_I(inode)->ip_clusters) { max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; trace_ocfs2_cluster_group_search_wrong_max_bits( (unsigned long long)le64_to_cpu(gd->bg_blkno), le16_to_cpu(gd->bg_bits), OCFS2_I(inode)->ip_clusters, max_bits); } ret = ocfs2_block_group_find_clear_bits(osb, group_bh, bits_wanted, max_bits, res); if (ret) return ret; if (max_block) { blkoff = ocfs2_clusters_to_blocks(inode->i_sb, gd_cluster_off + res->sr_bit_offset + res->sr_bits); trace_ocfs2_cluster_group_search_max_block( (unsigned long long)blkoff, (unsigned long long)max_block); if (blkoff > max_block) return -ENOSPC; } /* ocfs2_block_group_find_clear_bits() might * return success, but we still want to return * -ENOSPC unless it found the minimum number * of bits. */ if (min_bits <= res->sr_bits) search = 0; /* success */ } return search; } static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u64 max_block, struct ocfs2_suballoc_result *res) { int ret = -ENOSPC; u64 blkoff; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), res); if (!ret && max_block) { blkoff = le64_to_cpu(bg->bg_blkno) + res->sr_bit_offset + res->sr_bits; trace_ocfs2_block_group_search_max_block( (unsigned long long)blkoff, (unsigned long long)max_block); if (blkoff > max_block) ret = -ENOSPC; } } return ret; } int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, u16 chain) { int ret; u32 tmp_used; struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); ocfs2_journal_dirty(handle, di_bh); out: return ret; } void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain) { u32 tmp_used; struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl; cl = (struct ocfs2_chain_list *)&di->id2.i_chain; tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); } static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, struct ocfs2_extent_rec *rec, struct ocfs2_chain_list *cl) { unsigned int bpc = le16_to_cpu(cl->cl_bpc); unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; if (res->sr_bit_offset < bitoff) return 0; if (res->sr_bit_offset >= (bitoff + bitcount)) return 0; res->sr_blkno = le64_to_cpu(rec->e_blkno) + (res->sr_bit_offset - bitoff); if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; return 1; } static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *bg, struct ocfs2_suballoc_result *res) { int i; u64 bg_blkno = res->sr_bg_blkno; /* Save off */ struct ocfs2_extent_rec *rec; struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct ocfs2_chain_list *cl = &di->id2.i_chain; if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { res->sr_blkno = 0; return; } res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; res->sr_bg_blkno = 0; /* Clear it for contig block groups */ if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || !bg->bg_list.l_next_free_rec) return; for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { rec = &bg->bg_list.l_recs[i]; if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { res->sr_bg_blkno = bg_blkno; /* Restore */ break; } } } static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, struct ocfs2_suballoc_result *res, u16 *bits_left) { int ret; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *gd; struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; ret = ocfs2_read_group_descriptor(alloc_inode, di, res->sr_bg_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; } gd = (struct ocfs2_group_desc *) group_bh->b_data; ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, ac->ac_max_block, res); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } if (!ret) ocfs2_bg_discontig_fix_result(ac, gd, res); /* * sr_bg_blkno might have been changed by * ocfs2_bg_discontig_fix_result */ res->sr_bg_stable_blkno = group_bh->b_blocknr; if (ac->ac_find_loc_only) goto out_loc_only; ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); if (ret < 0) { mlog_errno(ret); goto out; } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, res->sr_bit_offset, res->sr_bits, res->sr_max_contig_bits, 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); mlog_errno(ret); } out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: brelse(group_bh); return ret; } static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, struct ocfs2_suballoc_result *res, u16 *bits_left) { int status; u16 chain; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; struct buffer_head *prev_group_bh = NULL; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; struct ocfs2_group_desc *bg; chain = ac->ac_chain; trace_ocfs2_search_chain_begin( (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, bits_wanted, chain); status = ocfs2_read_group_descriptor(alloc_inode, fe, le64_to_cpu(cl->cl_recs[chain].c_blkno), &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, ac->ac_max_block, res)) == -ENOSPC) { if (!bg->bg_next_group) break; brelse(prev_group_bh); prev_group_bh = NULL; next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; status = ocfs2_read_group_descriptor(alloc_inode, fe, next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } trace_ocfs2_search_chain_succ( (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); BUG_ON(res->sr_bits == 0); if (!status) ocfs2_bg_discontig_fix_result(ac, bg, res); /* * sr_bg_blkno might have been changed by * ocfs2_bg_discontig_fix_result */ res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When * we find a target, if we have read more than X * number of descriptors, and the target is reasonably * empty, relink him to top of his chain. * * We've read 0 extra blocks and only send one more to * the transaction, yet the next guy to search has a * much easier time. * * Do this *after* figuring out how many bits we're taking out * of our target group. */ if (!ac->ac_disable_chain_relink && (prev_group_bh) && (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, ac->ac_bh, group_bh, prev_group_bh, chain); if (status < 0) { mlog_errno(status); goto bail; } } if (ac->ac_find_loc_only) goto out_loc_only; status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, chain); if (status) { mlog_errno(status); goto bail; } status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, group_bh, res->sr_bit_offset, res->sr_bits, res->sr_max_contig_bits, 0); if (status < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } trace_ocfs2_search_chain_end( (unsigned long long)le64_to_cpu(fe->i_blkno), res->sr_bits); out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); brelse(prev_group_bh); if (status) mlog_errno(status); return status; } /* will give out up to bits_wanted contiguous bits. */ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, struct ocfs2_suballoc_result *res) { int status; u16 victim, i; u16 bits_left = 0; u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); BUG_ON(!ac->ac_bh); fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; /* The bh was validated by the inode read during * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { status = ocfs2_error(ac->ac_inode->i_sb, "Chain allocator dinode %llu has %u used bits but only %u total\n", (unsigned long long)le64_to_cpu(fe->i_blkno), le32_to_cpu(fe->id1.bitmap1.i_used), le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used * allocation group. This helps us maintain some * contiguousness across allocations. */ status = ocfs2_search_one_group(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) goto set_hint; if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; } } cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { if (ocfs2_is_cluster_bitmap(ac->ac_inode)) hint = res->sr_bg_blkno; else hint = ocfs2_group_from_res(res); goto set_hint; } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; } trace_ocfs2_claim_suballoc_bits(victim); /* If we didn't pick a good victim, then just default to * searching each chain in order. Don't allow chain relinking * because we only calculate enough journal credits for one * relink per alloc. */ ac->ac_disable_chain_relink = 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted) continue; ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { hint = ocfs2_group_from_res(res); break; } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; } } set_hint: if (status != -ENOSPC) { /* If the next search of this group is not likely to * yield a suitable extent, then we reset the last * group hint so as to not waste a disk read */ if (bits_left < min_bits) ac->ac_last_group = 0; else ac->ac_last_group = hint; } bail: if (status) mlog_errno(status); return status; } int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, u64 *suballoc_loc, u16 *suballoc_bit_start, unsigned int *num_bits, u64 *blkno_start) { int status; struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; BUG_ON(!ac); BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); BUG_ON(ac->ac_which != OCFS2_AC_USE_META); status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, 1, &res); if (status < 0) { mlog_errno(status); goto bail; } atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); *suballoc_loc = res.sr_bg_blkno; *suballoc_bit_start = res.sr_bit_offset; *blkno_start = res.sr_blkno; ac->ac_bits_given += res.sr_bits; *num_bits = res.sr_bits; status = 0; bail: if (status) mlog_errno(status); return status; } static void ocfs2_init_inode_ac_group(struct inode *dir, struct buffer_head *parent_di_bh, struct ocfs2_alloc_context *ac) { struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; /* * Try to allocate inodes from some specific group. * * If the parent dir has recorded the last group used in allocation, * cool, use it. Otherwise if we try to allocate new inode from the * same slot the parent dir belongs to, use the same chunk. * * We are very careful here to avoid the mistake of setting * ac_last_group to a group descriptor from a different (unlocked) slot. */ if (OCFS2_I(dir)->ip_last_used_group && OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { if (di->i_suballoc_loc) ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); else ac->ac_last_group = ocfs2_which_suballoc_group( le64_to_cpu(di->i_blkno), le16_to_cpu(di->i_suballoc_bit)); } } static inline void ocfs2_save_inode_ac_group(struct inode *dir, struct ocfs2_alloc_context *ac) { OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } int ocfs2_find_new_inode_loc(struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, u64 *fe_blkno) { int ret; handle_t *handle = NULL; struct ocfs2_suballoc_result *res; BUG_ON(!ac); BUG_ON(ac->ac_bits_given != 0); BUG_ON(ac->ac_bits_wanted != 1); BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); res = kzalloc(sizeof(*res), GFP_NOFS); if (res == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; } ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); /* * The handle started here is for chain relink. Alternatively, * we could just disable relink for these calls. */ handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); if (IS_ERR(handle)) { ret = PTR_ERR(handle); handle = NULL; mlog_errno(ret); goto out; } /* * This will instruct ocfs2_claim_suballoc_bits and * ocfs2_search_one_group to search but save actual allocation * for later. */ ac->ac_find_loc_only = 1; ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); if (ret < 0) { mlog_errno(ret); goto out; } ac->ac_find_loc_priv = res; *fe_blkno = res->sr_blkno; ocfs2_update_inode_fsync_trans(handle, dir, 0); out: if (handle) ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); if (ret) kfree(res); return ret; } int ocfs2_claim_new_inode_at_loc(handle_t *handle, struct inode *dir, struct ocfs2_alloc_context *ac, u64 *suballoc_loc, u16 *suballoc_bit, u64 di_blkno) { int ret; u16 chain; struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; struct buffer_head *bg_bh = NULL; struct ocfs2_group_desc *bg; struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; /* * Since di_blkno is being passed back in, we check for any * inconsistencies which may have happened between * calls. These are code bugs as di_blkno is not expected to * change once returned from ocfs2_find_new_inode_loc() */ BUG_ON(res->sr_blkno != di_blkno); ret = ocfs2_read_group_descriptor(ac->ac_inode, di, res->sr_bg_stable_blkno, &bg_bh); if (ret) { mlog_errno(ret); goto out; } bg = (struct ocfs2_group_desc *) bg_bh->b_data; chain = le16_to_cpu(bg->bg_chain); ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, ac->ac_bh, res->sr_bits, chain); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_block_group_set_bits(handle, ac->ac_inode, bg, bg_bh, res->sr_bit_offset, res->sr_bits, res->sr_max_contig_bits, 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, ac->ac_bh, res->sr_bits, chain); mlog_errno(ret); goto out; } trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, res->sr_bits); atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); BUG_ON(res->sr_bits != 1); *suballoc_loc = res->sr_bg_blkno; *suballoc_bit = res->sr_bit_offset; ac->ac_bits_given++; ocfs2_save_inode_ac_group(dir, ac); out: brelse(bg_bh); return ret; } int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, u64 *suballoc_loc, u16 *suballoc_bit, u64 *fe_blkno) { int status; struct ocfs2_suballoc_result res; BUG_ON(!ac); BUG_ON(ac->ac_bits_given != 0); BUG_ON(ac->ac_bits_wanted != 1); BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); status = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, &res); if (status < 0) { mlog_errno(status); goto bail; } atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); BUG_ON(res.sr_bits != 1); *suballoc_loc = res.sr_bg_blkno; *suballoc_bit = res.sr_bit_offset; *fe_blkno = res.sr_blkno; ac->ac_bits_given++; ocfs2_save_inode_ac_group(dir, ac); status = 0; bail: if (status) mlog_errno(status); return status; } /* translate a group desc. blkno and it's bitmap offset into * disk cluster offset. */ static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, u64 bg_blkno, u16 bg_bit_off) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u32 cluster = 0; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); if (bg_blkno != osb->first_cluster_group_blkno) cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); cluster += (u32) bg_bit_off; return cluster; } /* given a cluster offset, calculate which block group it belongs to * and return that block offset. */ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u32 group_no; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); group_no = cluster / osb->bitmap_cpg; if (!group_no) return osb->first_cluster_group_blkno; return ocfs2_clusters_to_blocks(inode->i_sb, group_no * osb->bitmap_cpg); } /* given the block number of a cluster start, calculate which cluster * group and descriptor bitmap offset that corresponds to. */ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); BUG_ON(!ocfs2_is_cluster_bitmap(inode)); *bg_blkno = ocfs2_which_cluster_group(inode, data_cluster); if (*bg_blkno == osb->first_cluster_group_blkno) *bg_bit_off = (u16) data_cluster; else *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, data_blkno - *bg_blkno); } /* * min_bits - minimum contiguous chunk from this total allocation we * can handle. set to what we asked for originally for a full * contig. allocation, set to '1' to indicate we can deal with extents * of any size. */ int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 max_clusters, u32 *cluster_start, u32 *num_clusters) { int status; unsigned int bits_wanted = max_clusters; struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL && ac->ac_which != OCFS2_AC_USE_MAIN); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { WARN_ON(min_clusters > 1); status = ocfs2_claim_local_alloc_bits(osb, handle, ac, bits_wanted, cluster_start, num_clusters); if (!status) atomic_inc(&osb->alloc_stats.local_data); } else { if (min_clusters > (osb->bitmap_cpg - 1)) { /* The only paths asking for contiguousness * should know about this already. */ mlog(ML_ERROR, "minimum allocation requested %u exceeds " "group bitmap size %u!\n", min_clusters, osb->bitmap_cpg); status = -ENOSPC; goto bail; } /* clamp the current request down to a realistic size. */ if (bits_wanted > (osb->bitmap_cpg - 1)) bits_wanted = osb->bitmap_cpg - 1; status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, min_clusters, &res); if (!status) { BUG_ON(res.sr_blkno); /* cluster alloc can't set */ *cluster_start = ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, res.sr_bg_blkno, res.sr_bit_offset); atomic_inc(&osb->alloc_stats.bitmap_data); *num_clusters = res.sr_bits; } } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); goto bail; } ac->ac_bits_given += *num_clusters; bail: if (status) mlog_errno(status); return status; } int ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 *cluster_start, u32 *num_clusters) { unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; return __ocfs2_claim_clusters(handle, ac, min_clusters, bits_wanted, cluster_start, num_clusters); } static int ocfs2_block_group_clear_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits, unsigned int max_contig_bits, void (*undo_fn)(unsigned int bit, unsigned long *bmap)) { int status; unsigned int tmp; u16 contig_bits; struct ocfs2_group_desc *undo_bg = NULL; struct journal_head *jh; /* The caller got this descriptor from * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); trace_ocfs2_block_group_clear_bits(bit_off, num_bits); BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), group_bh, undo_fn ? OCFS2_JOURNAL_ACCESS_UNDO : OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } jh = bh2jh(group_bh); if (undo_fn) { spin_lock(&jh->b_state_lock); undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; BUG_ON(!undo_bg); } tmp = num_bits; while(tmp--) { ocfs2_clear_bit((bit_off + tmp), (unsigned long *) bg->bg_bitmap); if (undo_fn) undo_fn(bit_off + tmp, (unsigned long *) undo_bg->bg_bitmap); } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { if (undo_fn) spin_unlock(&jh->b_state_lock); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), le16_to_cpu(bg->bg_free_bits_count), num_bits); } /* * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), * we still need to rescan whole bitmap. */ if (ocfs2_is_cluster_bitmap(alloc_inode)) { contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, le16_to_cpu(bg->bg_bits), 0); if (contig_bits > max_contig_bits) max_contig_bits = contig_bits; bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); } else { bg->bg_contig_free_bits = 0; } if (undo_fn) spin_unlock(&jh->b_state_lock); ocfs2_journal_dirty(handle, group_bh); bail: return status; } /* * expects the suballoc inode to already be locked. */ static int _ocfs2_free_suballoc_bits(handle_t *handle, struct inode *alloc_inode, struct buffer_head *alloc_bh, unsigned int start_bit, u64 bg_blkno, unsigned int count, void (*undo_fn)(unsigned int bit, unsigned long *bitmap)) { int status = 0; u32 tmp_used; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; __le16 old_bg_contig_free_bits = 0; /* The alloc_bh comes from ocfs2_free_dinode() or * ocfs2_free_clusters(). The callers have all locked the * allocator and gotten alloc_bh from the lock call. This * validates the dinode buffer. Any corruption that has happened * is a code bug. */ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); trace_ocfs2_free_suballoc_bits( (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, (unsigned long long)bg_blkno, start_bit, count); status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, &group_bh); if (status < 0) { mlog_errno(status); goto bail; } group = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); if (ocfs2_is_cluster_bitmap(alloc_inode)) old_bg_contig_free_bits = group->bg_contig_free_bits; status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, start_bit, count, 0, undo_fn); if (status < 0) { mlog_errno(status); goto bail; } status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, start_bit, count, le16_to_cpu(old_bg_contig_free_bits), 1); goto bail; } le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, count); tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); ocfs2_journal_dirty(handle, alloc_bh); bail: brelse(group_bh); return status; } int ocfs2_free_suballoc_bits(handle_t *handle, struct inode *alloc_inode, struct buffer_head *alloc_bh, unsigned int start_bit, u64 bg_blkno, unsigned int count) { return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, start_bit, bg_blkno, count, NULL); } int ocfs2_free_dinode(handle_t *handle, struct inode *inode_alloc_inode, struct buffer_head *inode_alloc_bh, struct ocfs2_dinode *di) { u64 blk = le64_to_cpu(di->i_blkno); u16 bit = le16_to_cpu(di->i_suballoc_bit); u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); if (di->i_suballoc_loc) bg_blkno = le64_to_cpu(di->i_suballoc_loc); return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, inode_alloc_bh, bit, bg_blkno, 1); } static int _ocfs2_free_clusters(handle_t *handle, struct inode *bitmap_inode, struct buffer_head *bitmap_bh, u64 start_blk, unsigned int num_clusters, void (*undo_fn)(unsigned int bit, unsigned long *bitmap)) { int status; u16 bg_start_bit; u64 bg_blkno; /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry * about looping on them. * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); trace_ocfs2_free_clusters((unsigned long long)bg_blkno, (unsigned long long)start_blk, bg_start_bit, num_clusters); status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, bg_start_bit, bg_blkno, num_clusters, undo_fn); if (status < 0) { mlog_errno(status); goto out; } ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), num_clusters); out: return status; } int ocfs2_free_clusters(handle_t *handle, struct inode *bitmap_inode, struct buffer_head *bitmap_bh, u64 start_blk, unsigned int num_clusters) { return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, start_blk, num_clusters, _ocfs2_set_bit); } /* * Give never-used clusters back to the global bitmap. We don't need * to protect these bits in the undo buffer. */ int ocfs2_release_clusters(handle_t *handle, struct inode *bitmap_inode, struct buffer_head *bitmap_bh, u64 start_blk, unsigned int num_clusters) { return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, start_blk, num_clusters, _ocfs2_clear_bit); } /* * For a given allocation, determine which allocators will need to be * accessed, and lock them, reserving the appropriate number of bits. * * Sparse file systems call this from ocfs2_write_begin_nolock() * and ocfs2_allocate_unwritten_extents(). * * File systems which don't support holes call this from * ocfs2_extend_allocation(). */ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, struct ocfs2_alloc_context **meta_ac) { int ret = 0, num_free_extents; unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); *meta_ac = NULL; if (data_ac) *data_ac = NULL; BUG_ON(clusters_to_add != 0 && data_ac == NULL); num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); goto out; } /* * Sparse allocation file systems need to be more conservative * with reserving room for expansion - the actual allocation * happens while we've got a journal handle open so re-taking * a cluster lock (because we ran out of room for another * extent) will violate ordering rules. * * Most of the time we'll only be seeing this 1 cluster at a time * anyway. * * Always lock for any unwritten extents - we might want to * add blocks during a split. */ if (!num_free_extents || (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } } if (clusters_to_add == 0) goto out; ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); *meta_ac = NULL; } /* * We cannot have an error and a non null *data_ac. */ } return ret; } /* * Read the inode specified by blkno to get suballoc_slot and * suballoc_bit. */ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, u16 *suballoc_slot, u64 *group_blkno, u16 *suballoc_bit) { int status; struct buffer_head *inode_bh = NULL; struct ocfs2_dinode *inode_fe; trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); /* dirty read disk */ status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); if (status < 0) { mlog(ML_ERROR, "read block %llu failed %d\n", (unsigned long long)blkno, status); goto bail; } inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; if (!OCFS2_IS_VALID_DINODE(inode_fe)) { mlog(ML_ERROR, "invalid inode %llu requested\n", (unsigned long long)blkno); status = -EINVAL; goto bail; } if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", (unsigned long long)blkno, (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); status = -EINVAL; goto bail; } if (suballoc_slot) *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); if (suballoc_bit) *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); if (group_blkno) *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); bail: brelse(inode_bh); if (status) mlog_errno(status); return status; } /* * test whether bit is SET in allocator bitmap or not. on success, 0 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno * is returned and *res is meaningless. Call this after you have * cluster locked against suballoc, or you may get a result based on * non-up2date contents */ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, struct buffer_head *alloc_bh, u64 group_blkno, u64 blkno, u16 bit, int *res) { struct ocfs2_dinode *alloc_di; struct ocfs2_group_desc *group; struct buffer_head *group_bh = NULL; u64 bg_blkno; int status; trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, (unsigned int)bit); alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", (unsigned int)bit, ocfs2_bits_per_group(&alloc_di->id2.i_chain)); status = -EINVAL; goto bail; } bg_blkno = group_blkno ? group_blkno : ocfs2_which_suballoc_group(blkno, bit); status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { mlog(ML_ERROR, "read group %llu failed %d\n", (unsigned long long)bg_blkno, status); goto bail; } group = (struct ocfs2_group_desc *) group_bh->b_data; *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); bail: brelse(group_bh); if (status) mlog_errno(status); return status; } /* * Test if the bit representing this inode (blkno) is set in the * suballocator. * * On success, 0 is returned and *res is 1 for SET; 0 otherwise. * * In the event of failure, a negative value is returned and *res is * meaningless. * * Callers must make sure to hold nfs_sync_lock to prevent * ocfs2_delete_inode() on another node from accessing the same * suballocator concurrently. */ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { int status; u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; trace_ocfs2_test_inode_bit((unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, &group_blkno, &suballoc_bit); if (status < 0) { mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); goto bail; } if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) inode_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); else inode_alloc_inode = ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, suballoc_slot); if (!inode_alloc_inode) { /* the error code could be inaccurate, but we are not able to * get the correct one. */ status = -EINVAL; mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", (u32)suballoc_slot); goto bail; } inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); goto bail; } status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, group_blkno, blkno, suballoc_bit, res); if (status < 0) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); bail: if (status) mlog_errno(status); return status; }
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ /* taskstats_kern.h - kernel header for per-task statistics interface * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 */ #ifndef _LINUX_TASKSTATS_KERN_H #define _LINUX_TASKSTATS_KERN_H #include <linux/taskstats.h> #include <linux/sched/signal.h> #include <linux/slab.h> #ifdef CONFIG_TASKSTATS extern struct kmem_cache *taskstats_cache; extern struct mutex taskstats_exit_mutex; static inline void taskstats_tgid_free(struct signal_struct *sig) { if (sig->stats) kmem_cache_free(taskstats_cache, sig->stats); } extern void taskstats_exit(struct task_struct *, int group_dead); extern void taskstats_init_early(void); #else static inline void taskstats_exit(struct task_struct *tsk, int group_dead) {} static inline void taskstats_tgid_free(struct signal_struct *sig) {} static inline void taskstats_init_early(void) {} #endif /* CONFIG_TASKSTATS */ #endif
15 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com> * * Based on the shift-and-subtract algorithm for computing integer * square root from Guy L. Steele. */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/limits.h> #include <linux/math.h> /** * int_sqrt - computes the integer square root * @x: integer of which to calculate the sqrt * * Computes: floor(sqrt(x)) */ unsigned long int_sqrt(unsigned long x) { unsigned long b, m, y = 0; if (x <= 1) return x; m = 1UL << (__fls(x) & ~1UL); while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } return y; } EXPORT_SYMBOL(int_sqrt); #if BITS_PER_LONG < 64 /** * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input * is expected. * @x: 64bit integer of which to calculate the sqrt */ u32 int_sqrt64(u64 x) { u64 b, m, y = 0; if (x <= ULONG_MAX) return int_sqrt((unsigned long) x); m = 1ULL << ((fls64(x) - 1) & ~1ULL); while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } return y; } EXPORT_SYMBOL(int_sqrt64); #endif
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/mii.h: definitions for MII-compatible transceivers * Originally drivers/net/sunhme.h. * * Copyright (C) 1996, 1999, 2001 David S. Miller (davem@redhat.com) */ #ifndef __LINUX_MII_H__ #define __LINUX_MII_H__ #include <linux/if.h> #include <linux/linkmode.h> #include <uapi/linux/mii.h> struct ethtool_cmd; struct mii_if_info { int phy_id; int advertising; int phy_id_mask; int reg_num_mask; unsigned int full_duplex : 1; /* is full duplex? */ unsigned int force_media : 1; /* is autoneg. disabled? */ unsigned int supports_gmii : 1; /* are GMII registers supported? */ struct net_device *dev; int (*mdio_read) (struct net_device *dev, int phy_id, int location); void (*mdio_write) (struct net_device *dev, int phy_id, int location, int val); }; extern int mii_link_ok (struct mii_if_info *mii); extern int mii_nway_restart (struct mii_if_info *mii); extern void mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern void mii_ethtool_get_link_ksettings( struct mii_if_info *mii, struct ethtool_link_ksettings *cmd); extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd); extern int mii_ethtool_set_link_ksettings( struct mii_if_info *mii, const struct ethtool_link_ksettings *cmd); extern int mii_check_gmii_support(struct mii_if_info *mii); extern void mii_check_link (struct mii_if_info *mii); extern unsigned int mii_check_media (struct mii_if_info *mii, unsigned int ok_to_print, unsigned int init_media); extern int generic_mii_ioctl(struct mii_if_info *mii_if, struct mii_ioctl_data *mii_data, int cmd, unsigned int *duplex_changed); static inline struct mii_ioctl_data *if_mii(struct ifreq *rq) { return (struct mii_ioctl_data *) &rq->ifr_ifru; } /** * mii_nway_result * @negotiated: value of MII ANAR and'd with ANLPAR * * Given a set of MII abilities, check each bit and returns the * currently supported media, in the priority order defined by * IEEE 802.3u. We use LPA_xxx constants but note this is not the * value of LPA solely, as described above. * * The one exception to IEEE 802.3u is that 100baseT4 is placed * between 100T-full and 100T-half. If your phy does not support * 100T4 this is fine. If your phy places 100T4 elsewhere in the * priority order, you will need to roll your own function. */ static inline unsigned int mii_nway_result (unsigned int negotiated) { unsigned int ret; if (negotiated & LPA_100FULL) ret = LPA_100FULL; else if (negotiated & LPA_100BASE4) ret = LPA_100BASE4; else if (negotiated & LPA_100HALF) ret = LPA_100HALF; else if (negotiated & LPA_10FULL) ret = LPA_10FULL; else ret = LPA_10HALF; return ret; } /** * mii_duplex * @duplex_lock: Non-zero if duplex is locked at full * @negotiated: value of MII ANAR and'd with ANLPAR * * A small helper function for a common case. Returns one * if the media is operating or locked at full duplex, and * returns zero otherwise. */ static inline unsigned int mii_duplex (unsigned int duplex_lock, unsigned int negotiated) { if (duplex_lock) return 1; if (mii_nway_result(negotiated) & LPA_DUPLEX) return 1; return 0; } /** * ethtool_adv_to_mii_adv_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_10baseT_Half) result |= ADVERTISE_10HALF; if (ethadv & ADVERTISED_10baseT_Full) result |= ADVERTISE_10FULL; if (ethadv & ADVERTISED_100baseT_Half) result |= ADVERTISE_100HALF; if (ethadv & ADVERTISED_100baseT_Full) result |= ADVERTISE_100FULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_PAUSE_CAP; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * linkmode_adv_to_mii_adv_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ static inline u32 linkmode_adv_to_mii_adv_t(const unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising)) result |= ADVERTISE_10HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising)) result |= ADVERTISE_10FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising)) result |= ADVERTISE_100HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising)) result |= ADVERTISE_100FULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) result |= ADVERTISE_PAUSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_t * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to ethtool advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_10HALF) result |= ADVERTISED_10baseT_Half; if (adv & ADVERTISE_10FULL) result |= ADVERTISED_10baseT_Full; if (adv & ADVERTISE_100HALF) result |= ADVERTISED_100baseT_Half; if (adv & ADVERTISE_100FULL) result |= ADVERTISED_100baseT_Full; if (adv & ADVERTISE_PAUSE_CAP) result |= ADVERTISED_Pause; if (adv & ADVERTISE_PAUSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * ethtool_adv_to_mii_ctrl1000_t * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000HALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000FULL; return result; } /** * linkmode_adv_to_mii_ctrl1000_t * @advertising: the linkmode advertisement settings * * A small helper function that translates linkmode advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ static inline u32 linkmode_adv_to_mii_ctrl1000_t(const unsigned long *advertising) { u32 result = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising)) result |= ADVERTISE_1000HALF; if (linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising)) result |= ADVERTISE_1000FULL; return result; } /** * mii_ctrl1000_to_ethtool_adv_t * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_ctrl1000_to_ethtool_adv_t(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000HALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_lpa_to_ethtool_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA * bits, when in 1000Base-T mode, to ethtool * LP advertisement settings. */ static inline u32 mii_lpa_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_LPACK) result |= ADVERTISED_Autoneg; return result | mii_adv_to_ethtool_adv_t(lpa); } /** * mii_stat1000_to_ethtool_lpa_t * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 * bits, when in 1000Base-T mode, to ethtool * advertisement settings. */ static inline u32 mii_stat1000_to_ethtool_lpa_t(u32 lpa) { u32 result = 0; if (lpa & LPA_1000HALF) result |= ADVERTISED_1000baseT_Half; if (lpa & LPA_1000FULL) result |= ADVERTISED_1000baseT_Full; return result; } /** * mii_stat1000_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings * @adv: value of the MII_STAT1000 register * * A small helper function that translates MII_STAT1000 bits, when in * 1000Base-T mode, to linkmode advertisement settings. Other bits in * advertising are not changes. */ static inline void mii_stat1000_mod_linkmode_lpa_t(unsigned long *advertising, u32 lpa) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, lpa & LPA_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, lpa & LPA_1000FULL); } /** * ethtool_adv_to_mii_adv_x * @ethadv: the ethtool advertisement settings * * A small helper function that translates ethtool advertisement * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000Base-X mode. */ static inline u32 ethtool_adv_to_mii_adv_x(u32 ethadv) { u32 result = 0; if (ethadv & ADVERTISED_1000baseT_Half) result |= ADVERTISE_1000XHALF; if (ethadv & ADVERTISED_1000baseT_Full) result |= ADVERTISE_1000XFULL; if (ethadv & ADVERTISED_Pause) result |= ADVERTISE_1000XPAUSE; if (ethadv & ADVERTISED_Asym_Pause) result |= ADVERTISE_1000XPSE_ASYM; return result; } /** * mii_adv_to_ethtool_adv_x * @adv: value of the MII_CTRL1000 register * * A small helper function that translates MII_CTRL1000 * bits, when in 1000Base-X mode, to ethtool * advertisement settings. */ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) { u32 result = 0; if (adv & ADVERTISE_1000XHALF) result |= ADVERTISED_1000baseT_Half; if (adv & ADVERTISE_1000XFULL) result |= ADVERTISED_1000baseT_Full; if (adv & ADVERTISE_1000XPAUSE) result |= ADVERTISED_Pause; if (adv & ADVERTISE_1000XPSE_ASYM) result |= ADVERTISED_Asym_Pause; return result; } /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits to * linkmode advertisement settings. Leaves other bits unchanged. */ static inline void mii_adv_mod_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, advertising, adv & ADVERTISE_10HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, advertising, adv & ADVERTISE_10FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, advertising, adv & ADVERTISE_100HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, advertising, adv & ADVERTISE_100FULL); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_CAP); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising, adv & ADVERTISE_PAUSE_ASYM); } /** * mii_adv_to_linkmode_adv_t * @advertising:pointer to destination link mode. * @adv: value of the MII_ADVERTISE register * * A small helper function that translates MII_ADVERTISE bits * to linkmode advertisement settings. Clears the old value * of advertising. */ static inline void mii_adv_to_linkmode_adv_t(unsigned long *advertising, u32 adv) { linkmode_zero(advertising); mii_adv_mod_linkmode_adv_t(advertising, adv); } /** * mii_lpa_to_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Clears the * old value of advertising */ static inline void mii_lpa_to_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_to_linkmode_adv_t(lp_advertising, lpa); if (lpa & LPA_LPACK) linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising); } /** * mii_lpa_mod_linkmode_lpa_t * @adv: value of the MII_LPA register * * A small helper function that translates MII_LPA bits, when in * 1000Base-T mode, to linkmode LP advertisement settings. Leaves * other bits unchanged. */ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, u32 lpa) { mii_adv_mod_linkmode_adv_t(lp_advertising, lpa); linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, lp_advertising, lpa & LPA_LPACK); } static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, u32 ctrl1000) { linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, ctrl1000 & ADVERTISE_1000HALF); linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, ctrl1000 & ADVERTISE_1000FULL); } /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising * * A small helper function that translates linkmode advertising to LVL * pause capabilities. */ static inline u32 linkmode_adv_to_lcl_adv_t(const unsigned long *advertising) { u32 lcl_adv = 0; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_CAP; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, advertising)) lcl_adv |= ADVERTISE_PAUSE_ASYM; return lcl_adv; } /** * mii_lpa_mod_linkmode_x - decode the link partner's config_reg to linkmodes * @linkmodes: link modes array * @lpa: config_reg word from link partner * @fd_bit: link mode for 1000XFULL bit */ static inline void mii_lpa_mod_linkmode_x(unsigned long *linkmodes, u16 lpa, int fd_bit) { linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, linkmodes, lpa & LPA_LPACK); linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE); linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes, lpa & LPA_1000XPAUSE_ASYM); linkmode_mod_bit(fd_bit, linkmodes, lpa & LPA_1000XFULL); } /** * linkmode_adv_to_mii_adv_x - encode a linkmode to config_reg * @linkmodes: linkmodes * @fd_bit: full duplex bit */ static inline u16 linkmode_adv_to_mii_adv_x(const unsigned long *linkmodes, int fd_bit) { u16 adv = 0; if (linkmode_test_bit(fd_bit, linkmodes)) adv |= ADVERTISE_1000XFULL; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPAUSE; if (linkmode_test_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, linkmodes)) adv |= ADVERTISE_1000XPSE_ASYM; return adv; } /** * mii_advertise_flowctrl - get flow control advertisement flags * @cap: Flow control capabilities (FLOW_CTRL_RX, FLOW_CTRL_TX or both) */ static inline u16 mii_advertise_flowctrl(int cap) { u16 adv = 0; if (cap & FLOW_CTRL_RX) adv = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; if (cap & FLOW_CTRL_TX) adv ^= ADVERTISE_PAUSE_ASYM; return adv; } /** * mii_resolve_flowctrl_fdx * @lcladv: value of MII ADVERTISE register * @rmtadv: value of MII LPA register * * Resolve full duplex flow control as per IEEE 802.3-2005 table 28B-3 */ static inline u8 mii_resolve_flowctrl_fdx(u16 lcladv, u16 rmtadv) { u8 cap = 0; if (lcladv & rmtadv & ADVERTISE_PAUSE_CAP) { cap = FLOW_CTRL_TX | FLOW_CTRL_RX; } else if (lcladv & rmtadv & ADVERTISE_PAUSE_ASYM) { if (lcladv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_RX; else if (rmtadv & ADVERTISE_PAUSE_CAP) cap = FLOW_CTRL_TX; } return cap; } /** * mii_bmcr_encode_fixed - encode fixed speed/duplex settings to a BMCR value * @speed: a SPEED_* value * @duplex: a DUPLEX_* value * * Encode the speed and duplex to a BMCR value. 2500, 1000, 100 and 10 Mbps are * supported. 2500Mbps is encoded to 1000Mbps. Other speeds are encoded as 10 * Mbps. Unknown duplex values are encoded to half-duplex. */ static inline u16 mii_bmcr_encode_fixed(int speed, int duplex) { u16 bmcr; switch (speed) { case SPEED_2500: case SPEED_1000: bmcr = BMCR_SPEED1000; break; case SPEED_100: bmcr = BMCR_SPEED100; break; case SPEED_10: default: bmcr = BMCR_SPEED10; break; } if (duplex == DUPLEX_FULL) bmcr |= BMCR_FULLDPLX; return bmcr; } #endif /* __LINUX_MII_H__ */
132 21 21 21 119 117 22 22 22 22 22 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET6_HASHTABLES_H #define _INET6_HASHTABLES_H #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/types.h> #include <linux/jhash.h> #include <net/inet_sock.h> #include <net/ipv6.h> #include <net/netns/hash.h> struct inet_hashinfo; static inline unsigned int __inet6_ehashfn(const u32 lhash, const u16 lport, const u32 fhash, const __be16 fport, const u32 initval) { const u32 ports = (((u32)lport) << 16) | (__force u32)fport; return jhash_3words(lhash, fhash, ports, initval); } /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet6_ehashfn_t)(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport); inet6_ehashfn_t inet6_ehashfn; INDIRECT_CALLABLE_DECLARE(inet6_ehashfn_t udp6_ehashfn); struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif); struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif, bool *refcounted) { struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, bool *refcounted, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. */ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, bool *refcounted) { struct net *net = dev_net(skb_dst(skb)->dev); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; sk = inet6_steal_sock(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, dport, refcounted, inet6_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet6_lookup(net, hashinfo, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); int inet6_hash(struct sock *sk); static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, const __portpair ports, const int dif, const int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_family != AF_INET6 || sk->sk_portpair != ports || !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _INET6_HASHTABLES_H */
27 3638 3634 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 #ifndef _LINUX_SCHED_ISOLATION_H #define _LINUX_SCHED_ISOLATION_H #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/init.h> #include <linux/tick.h> enum hk_type { HK_TYPE_TIMER, HK_TYPE_RCU, HK_TYPE_MISC, HK_TYPE_SCHED, HK_TYPE_TICK, HK_TYPE_DOMAIN, HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, HK_TYPE_KTHREAD, HK_TYPE_MAX }; #ifdef CONFIG_CPU_ISOLATION DECLARE_STATIC_KEY_FALSE(housekeeping_overridden); extern int housekeeping_any_cpu(enum hk_type type); extern const struct cpumask *housekeeping_cpumask(enum hk_type type); extern bool housekeeping_enabled(enum hk_type type); extern void housekeeping_affine(struct task_struct *t, enum hk_type type); extern bool housekeeping_test_cpu(int cpu, enum hk_type type); extern void __init housekeeping_init(void); #else static inline int housekeeping_any_cpu(enum hk_type type) { return smp_processor_id(); } static inline const struct cpumask *housekeeping_cpumask(enum hk_type type) { return cpu_possible_mask; } static inline bool housekeeping_enabled(enum hk_type type) { return false; } static inline void housekeeping_affine(struct task_struct *t, enum hk_type type) { } static inline bool housekeeping_test_cpu(int cpu, enum hk_type type) { return true; } static inline void housekeeping_init(void) { } #endif /* CONFIG_CPU_ISOLATION */ static inline bool housekeeping_cpu(int cpu, enum hk_type type) { #ifdef CONFIG_CPU_ISOLATION if (static_branch_unlikely(&housekeeping_overridden)) return housekeeping_test_cpu(cpu, type); #endif return true; } static inline bool cpu_is_isolated(int cpu) { return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || cpuset_cpu_is_isolated(cpu); } #endif /* _LINUX_SCHED_ISOLATION_H */
409 32 303 42 187 33 159 194 13 4 211 72 159 18 39 2 284 210 41 226 41 43 32 128 42 152 98 184 171 171 181 199 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FAT_H #define _FAT_H #include <linux/buffer_head.h> #include <linux/nls.h> #include <linux/hash.h> #include <linux/ratelimit.h> #include <linux/msdos_fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> /* * vfat shortname flags */ #define VFAT_SFN_DISPLAY_LOWER 0x0001 /* convert to lowercase for display */ #define VFAT_SFN_DISPLAY_WIN95 0x0002 /* emulate win95 rule for display */ #define VFAT_SFN_DISPLAY_WINNT 0x0004 /* emulate winnt rule for display */ #define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ #define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ #define FAT_ERRORS_CONT 1 /* ignore error and continue */ #define FAT_ERRORS_PANIC 2 /* panic on error */ #define FAT_ERRORS_RO 3 /* remount r/o on error */ #define FAT_NFS_STALE_RW 1 /* NFS RW support, can cause ESTALE */ #define FAT_NFS_NOSTALE_RO 2 /* NFS RO support, no ESTALE issue */ struct fat_mount_options { kuid_t fs_uid; kgid_t fs_gid; unsigned short fs_fmask; unsigned short fs_dmask; unsigned short codepage; /* Codepage for shortname conversions */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ char *iocharset; /* Charset used for filename input/display */ unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ unsigned char errors; /* On error: continue, panic, remount-ro */ unsigned char nfs; /* NFS support: nostale_ro, stale_rw */ unsigned short allow_utime;/* permission for setting the [am]time */ unsigned quiet:1, /* set = fake successful chmods and chowns */ showexec:1, /* set = only set x bit for com/exe/bat */ sys_immutable:1, /* set = system files are immutable */ dotsOK:1, /* set = hidden and system files are named '.filename' */ isvfat:1, /* 0=no vfat long filename support, 1=vfat support */ utf8:1, /* Use of UTF-8 character set (Default) */ unicode_xlate:1, /* create escape sequences for unhandled Unicode */ numtail:1, /* Does first alias have a numeric '~1' type tail? */ flush:1, /* write things quickly */ nocase:1, /* Does this need case conversion? 0=need case conversion*/ usefree:1, /* Use free_clusters for FAT32 */ tz_set:1, /* Filesystem timestamps' offset set */ rodir:1, /* allow ATTR_RO for directory */ discard:1, /* Issue discard requests on deletions */ dos1xfloppy:1, /* Assume default BPB for DOS 1.x floppies */ debug:1; /* Not currently used */ }; #define FAT_HASH_BITS 8 #define FAT_HASH_SIZE (1UL << FAT_HASH_BITS) /* * MS-DOS file system in-core superblock data */ struct msdos_sb_info { unsigned short sec_per_clus; /* sectors/cluster */ unsigned short cluster_bits; /* log2(cluster_size) */ unsigned int cluster_size; /* cluster size */ unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */ unsigned short fat_start; unsigned long fat_length; /* FAT start & length (sec.) */ unsigned long dir_start; unsigned short dir_entries; /* root dir start & entries */ unsigned long data_start; /* first data sector */ unsigned long max_cluster; /* maximum cluster number */ unsigned long root_cluster; /* first cluster of the root directory */ unsigned long fsinfo_sector; /* sector number of FAT32 fsinfo */ struct mutex fat_lock; struct mutex nfs_build_inode_lock; struct mutex s_lock; unsigned int prev_free; /* previously allocated cluster number */ unsigned int free_clusters; /* -1 if undefined */ unsigned int free_clus_valid; /* is free_clusters valid? */ struct fat_mount_options options; struct nls_table *nls_disk; /* Codepage used on disk */ struct nls_table *nls_io; /* Charset used for input and display */ const void *dir_ops; /* Opaque; default directory operations */ int dir_per_block; /* dir entries per block */ int dir_per_block_bits; /* log2(dir_per_block) */ unsigned int vol_id; /*volume ID*/ int fatent_shift; const struct fatent_operations *fatent_ops; struct inode *fat_inode; struct inode *fsinfo_inode; struct ratelimit_state ratelimit; spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; spinlock_t dir_hash_lock; struct hlist_head dir_hashtable[FAT_HASH_SIZE]; unsigned int dirty; /* fs state before mount */ struct rcu_head rcu; }; #define FAT_CACHE_VALID 0 /* special case for valid cache */ /* * MS-DOS file system inode data in memory */ struct msdos_inode_info { spinlock_t cache_lru_lock; struct list_head cache_lru; int nr_caches; /* for avoiding the race between fat_free() and fat_get_cluster() */ unsigned int cache_valid_id; /* NOTE: mmu_private is 64bits, so must hold ->i_mutex to access */ loff_t mmu_private; /* physically allocated size */ int i_start; /* first cluster or 0 */ int i_logstart; /* logical first cluster */ int i_attrs; /* unused attribute bits */ loff_t i_pos; /* on-disk position of directory entry or 0 */ struct hlist_node i_fat_hash; /* hash by i_location */ struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ struct timespec64 i_crtime; /* File creation (birth) time */ struct inode vfs_inode; }; struct fat_slot_info { loff_t i_pos; /* on-disk position of directory entry */ loff_t slot_off; /* offset for slot or de start */ int nr_slots; /* number of slots + 1(de) in filename */ struct msdos_dir_entry *de; struct buffer_head *bh; }; static inline struct msdos_sb_info *MSDOS_SB(struct super_block *sb) { return sb->s_fs_info; } /* * Functions that determine the variant of the FAT file system (i.e., * whether this is FAT12, FAT16 or FAT32. */ static inline bool is_fat12(const struct msdos_sb_info *sbi) { return sbi->fat_bits == 12; } static inline bool is_fat16(const struct msdos_sb_info *sbi) { return sbi->fat_bits == 16; } static inline bool is_fat32(const struct msdos_sb_info *sbi) { return sbi->fat_bits == 32; } /* Maximum number of clusters */ static inline u32 max_fat(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); return is_fat32(sbi) ? MAX_FAT32 : is_fat16(sbi) ? MAX_FAT16 : MAX_FAT12; } static inline struct msdos_inode_info *MSDOS_I(struct inode *inode) { return container_of(inode, struct msdos_inode_info, vfs_inode); } /* * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to * save ATTR_RO instead of ->i_mode. * * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only * bit, it's just used as flag for app. */ static inline int fat_mode_can_hold_ro(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); umode_t mask; if (S_ISDIR(inode->i_mode)) { if (!sbi->options.rodir) return 0; mask = ~sbi->options.fs_dmask; } else mask = ~sbi->options.fs_fmask; if (!(mask & S_IWUGO)) return 0; return 1; } /* Convert attribute bits and a mask to the UNIX mode. */ static inline umode_t fat_make_mode(struct msdos_sb_info *sbi, u8 attrs, umode_t mode) { if (attrs & ATTR_RO && !((attrs & ATTR_DIR) && !sbi->options.rodir)) mode &= ~S_IWUGO; if (attrs & ATTR_DIR) return (mode & ~sbi->options.fs_dmask) | S_IFDIR; else return (mode & ~sbi->options.fs_fmask) | S_IFREG; } /* Return the FAT attribute byte for this inode */ static inline u8 fat_make_attrs(struct inode *inode) { u8 attrs = MSDOS_I(inode)->i_attrs; if (S_ISDIR(inode->i_mode)) attrs |= ATTR_DIR; if (fat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO)) attrs |= ATTR_RO; return attrs; } static inline void fat_save_attrs(struct inode *inode, u8 attrs) { if (fat_mode_can_hold_ro(inode)) MSDOS_I(inode)->i_attrs = attrs & ATTR_UNUSED; else MSDOS_I(inode)->i_attrs = attrs & (ATTR_UNUSED | ATTR_RO); } static inline unsigned char fat_checksum(const __u8 *name) { unsigned char s = name[0]; s = (s<<7) + (s>>1) + name[1]; s = (s<<7) + (s>>1) + name[2]; s = (s<<7) + (s>>1) + name[3]; s = (s<<7) + (s>>1) + name[4]; s = (s<<7) + (s>>1) + name[5]; s = (s<<7) + (s>>1) + name[6]; s = (s<<7) + (s>>1) + name[7]; s = (s<<7) + (s>>1) + name[8]; s = (s<<7) + (s>>1) + name[9]; s = (s<<7) + (s>>1) + name[10]; return s; } static inline sector_t fat_clus_to_blknr(struct msdos_sb_info *sbi, int clus) { return ((sector_t)clus - FAT_START_ENT) * sbi->sec_per_clus + sbi->data_start; } static inline void fat_get_blknr_offset(struct msdos_sb_info *sbi, loff_t i_pos, sector_t *blknr, int *offset) { *blknr = i_pos >> sbi->dir_per_block_bits; *offset = i_pos & (sbi->dir_per_block - 1); } static inline loff_t fat_i_pos_read(struct msdos_sb_info *sbi, struct inode *inode) { loff_t i_pos; #if BITS_PER_LONG == 32 spin_lock(&sbi->inode_hash_lock); #endif i_pos = MSDOS_I(inode)->i_pos; #if BITS_PER_LONG == 32 spin_unlock(&sbi->inode_hash_lock); #endif return i_pos; } static inline void fat16_towchar(wchar_t *dst, const __u8 *src, size_t len) { #ifdef __BIG_ENDIAN while (len--) { *dst++ = src[0] | (src[1] << 8); src += 2; } #else memcpy(dst, src, len * 2); #endif } static inline int fat_get_start(const struct msdos_sb_info *sbi, const struct msdos_dir_entry *de) { int cluster = le16_to_cpu(de->start); if (is_fat32(sbi)) cluster |= (le16_to_cpu(de->starthi) << 16); return cluster; } static inline void fat_set_start(struct msdos_dir_entry *de, int cluster) { de->start = cpu_to_le16(cluster); de->starthi = cpu_to_le16(cluster >> 16); } static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) { #ifdef __BIG_ENDIAN while (len--) { dst[0] = *src & 0x00FF; dst[1] = (*src & 0xFF00) >> 8; dst += 2; src++; } #else memcpy(dst, src, len * 2); #endif } /* fat/cache.c */ extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, sector_t last_block, unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; extern int fat_search_long(struct inode *inode, const unsigned char *name, int name_len, struct fat_slot_info *sinfo); extern int fat_dir_empty(struct inode *dir); extern int fat_subdirs(struct inode *dir); extern int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo); extern int fat_scan_logstart(struct inode *dir, int i_logstart, struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts); extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct fat_slot_info *sinfo); extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); /* fat/fatent.c */ struct fat_entry { int entry; union { u8 *ent12_p[2]; __le16 *ent16_p; __le32 *ent32_p; } u; int nr_bhs; struct buffer_head *bhs[2]; struct inode *fat_inode; }; static inline void fatent_init(struct fat_entry *fatent) { fatent->nr_bhs = 0; fatent->entry = 0; fatent->u.ent32_p = NULL; fatent->bhs[0] = fatent->bhs[1] = NULL; fatent->fat_inode = NULL; } static inline void fatent_set_entry(struct fat_entry *fatent, int entry) { fatent->entry = entry; fatent->u.ent32_p = NULL; } static inline void fatent_brelse(struct fat_entry *fatent) { int i; fatent->u.ent32_p = NULL; for (i = 0; i < fatent->nr_bhs; i++) brelse(fatent->bhs[i]); fatent->nr_bhs = 0; fatent->bhs[0] = fatent->bhs[1] = NULL; fatent->fat_inode = NULL; } static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry) { return FAT_START_ENT <= entry && entry < sbi->max_cluster; } extern void fat_ent_access_init(struct super_block *sb); extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry); extern int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait); extern int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster); extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range); /* fat/file.c */ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; extern int fat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); extern int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* fat/inode.c */ extern int fat_block_truncate_page(struct inode *inode, loff_t from); extern void fat_attach(struct inode *inode, loff_t i_pos); extern void fat_detach(struct inode *inode); extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, struct fs_context *fc, void (*setup)(struct super_block *)); extern int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); extern const struct fs_parameter_spec fat_param_spec[]; int fat_init_fs_context(struct fs_context *fc, bool is_vfat); void fat_free_fc(struct fs_context *fc); int fat_parse_param(struct fs_context *fc, struct fs_parameter *param, bool is_vfat); int fat_reconfigure(struct fs_context *fc); static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } extern int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); #define fat_fs_error(sb, fmt, args...) \ __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) #define FAT_PRINTK_PREFIX "%sFAT-fs (%s): " #define fat_msg(sb, level, fmt, args...) \ do { \ printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\ _fat_msg(sb, level, fmt, ##args); \ } while (0) __printf(3, 4) __cold void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); #define fat_msg_ratelimit(sb, level, fmt, args...) \ do { \ if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ fat_msg(sb, level, fmt, ## args); \ } while (0) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); extern int fat_update_time(struct inode *inode, int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); void fat_cache_destroy(void); /* fat/nfs.c */ extern const struct export_operations fat_export_ops; extern const struct export_operations fat_export_ops_nostale; /* helper for printk */ typedef unsigned long long llu; #endif /* !_FAT_H */
17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_journal_iter.h" #include "btree_node_scan.h" #include "btree_update_interior.h" #include "buckets.h" #include "error.h" #include "journal_io.h" #include "recovery_passes.h" #include <linux/kthread.h> #include <linux/sort.h> struct find_btree_nodes_worker { struct closure *cl; struct find_btree_nodes *f; struct bch_dev *ca; }; static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) { prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->journal_seq, n->cookie); bch2_bpos_to_text(out, n->min_key); prt_str(out, "-"); bch2_bpos_to_text(out, n->max_key); if (n->range_updated) prt_str(out, " range updated"); if (n->overwritten) prt_str(out, " overwritten"); for (unsigned i = 0; i < n->nr_ptrs; i++) { prt_char(out, ' '); bch2_extent_ptr_to_text(out, c, n->ptrs + i); } } static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes) { printbuf_indent_add(out, 2); darray_for_each(nodes, i) { found_btree_node_to_text(out, c, i); prt_newline(out); } printbuf_indent_sub(out, 2); } static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f) { struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k); set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs); bp->k.p = f->max_key; bp->v.seq = cpu_to_le64(f->cookie); bp->v.sectors_written = 0; bp->v.flags = 0; bp->v.sectors_written = cpu_to_le16(f->sectors_written); bp->v.min_key = f->min_key; SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated); memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs); } static inline u64 bkey_journal_seq(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_inode_v3: return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq); default: return 0; } } static bool found_btree_node_is_readable(struct btree_trans *trans, struct found_btree_node *f) { struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; found_btree_node_to_key(&tmp.k, f); struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); bool ret = !IS_ERR_OR_NULL(b); if (!ret) return ret; f->sectors_written = b->written; f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); struct bkey_s_c k; struct bkey unpacked; struct btree_node_iter iter; for_each_btree_node_key_unpack(b, k, &iter, &unpacked) f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); six_unlock_read(&b->c.lock); /* * We might update this node's range; if that happens, we need the node * to be re-read so the read path can trim keys that are no longer in * this node */ if (b != btree_node_root(trans->c, b)) bch2_btree_node_evict(trans, &tmp.k); return ret; } static int found_btree_node_cmp_cookie(const void *_l, const void *_r) { const struct found_btree_node *l = _l; const struct found_btree_node *r = _r; return cmp_int(l->btree_id, r->btree_id) ?: cmp_int(l->level, r->level) ?: cmp_int(l->cookie, r->cookie); } /* * Given two found btree nodes, if their sequence numbers are equal, take the * one that's readable: */ static int found_btree_node_cmp_time(const struct found_btree_node *l, const struct found_btree_node *r) { return cmp_int(l->seq, r->seq) ?: cmp_int(l->journal_seq, r->journal_seq); } static int found_btree_node_cmp_pos(const void *_l, const void *_r) { const struct found_btree_node *l = _l; const struct found_btree_node *r = _r; return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->min_key, r->min_key) ?: -found_btree_node_cmp_time(l, r); } static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, struct bio *bio, struct btree_node *bn, u64 offset) { struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, bn, PAGE_SIZE); submit_bio_wait(bio); if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, "IO error in try_read_btree_node() at %llu: %s", offset, bch2_blk_status_to_str(bio->bi_status))) return; if (le64_to_cpu(bn->magic) != bset_magic(c)) return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { struct nonce nonce = btree_nonce(&bn->keys, 0); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes); } if (btree_id_is_alloc(BTREE_NODE_ID(bn))) return; if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH) return; rcu_read_lock(); struct found_btree_node n = { .btree_id = BTREE_NODE_ID(bn), .level = BTREE_NODE_LEVEL(bn), .seq = BTREE_NODE_SEQ(bn), .cookie = le64_to_cpu(bn->keys.seq), .min_key = bn->min_key, .max_key = bn->max_key, .nr_ptrs = 1, .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr, .ptrs[0].offset = offset, .ptrs[0].dev = ca->dev_idx, .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)), }; rcu_read_unlock(); if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { mutex_lock(&f->lock); if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { bch_err(c, "try_read_btree_node() can't handle endian conversion"); f->ret = -EINVAL; goto unlock; } if (darray_push(&f->nodes, n)) f->ret = -ENOMEM; unlock: mutex_unlock(&f->lock); } } static int read_btree_nodes_worker(void *p) { struct find_btree_nodes_worker *w = p; struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); struct bch_dev *ca = w->ca; void *buf = (void *) __get_free_page(GFP_KERNEL); struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); unsigned long last_print = jiffies; if (!buf || !bio) { bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); w->f->ret = -ENOMEM; goto err; } for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++) for (unsigned bucket_offset = 0; bucket_offset + btree_sectors(c) <= ca->mi.bucket_size; bucket_offset += btree_sectors(c)) { if (time_after(jiffies, last_print + HZ * 30)) { u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset; u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size; bch_info(ca, "%s: %2u%% done", __func__, (unsigned) div64_u64(cur_sector * 100, end_sector)); last_print = jiffies; } u64 sector = bucket * ca->mi.bucket_size + bucket_offset; if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap && !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) continue; try_read_btree_node(w->f, ca, bio, buf, sector); } err: bio_put(bio); free_page((unsigned long) buf); percpu_ref_get(&ca->io_ref); closure_put(w->cl); kfree(w); return 0; } static int read_btree_nodes(struct find_btree_nodes *f) { struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); struct closure cl; int ret = 0; closure_init_stack(&cl); for_each_online_member(c, ca) { if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); struct task_struct *t; if (!w) { percpu_ref_put(&ca->io_ref); ret = -ENOMEM; goto err; } percpu_ref_get(&ca->io_ref); closure_get(&cl); w->cl = &cl; w->f = f; w->ca = ca; t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { percpu_ref_put(&ca->io_ref); closure_put(&cl); f->ret = ret; bch_err(c, "error starting kthread: %i", ret); break; } } err: closure_sync(&cl); return f->ret ?: ret; } static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) { while (n + 1 < end && found_btree_node_cmp_pos(n, n + 1) > 0) { swap(n[0], n[1]); n++; } } static int handle_overwrites(struct bch_fs *c, struct found_btree_node *start, struct found_btree_node *end) { struct found_btree_node *n; again: for (n = start + 1; n < end && n->btree_id == start->btree_id && n->level == start->level && bpos_lt(n->min_key, start->max_key); n++) { int cmp = found_btree_node_cmp_time(start, n); if (cmp > 0) { if (bpos_cmp(start->max_key, n->max_key) >= 0) n->overwritten = true; else { n->range_updated = true; n->min_key = bpos_successor(start->max_key); n->range_updated = true; bubble_up(n, end); goto again; } } else if (cmp < 0) { BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); start->max_key = bpos_predecessor(n->min_key); start->range_updated = true; } else if (n->level) { n->overwritten = true; } else { if (bpos_cmp(start->max_key, n->max_key) >= 0) n->overwritten = true; else { n->range_updated = true; n->min_key = bpos_successor(start->max_key); n->range_updated = true; bubble_up(n, end); goto again; } } } return 0; } int bch2_scan_for_btree_nodes(struct bch_fs *c) { struct find_btree_nodes *f = &c->found_btree_nodes; struct printbuf buf = PRINTBUF; size_t dst; int ret = 0; if (f->nodes.nr) return 0; mutex_init(&f->lock); ret = read_btree_nodes(f); if (ret) return ret; if (!f->nodes.nr) { bch_err(c, "%s: no btree nodes found", __func__); ret = -EINVAL; goto err; } if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); } sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); dst = 0; darray_for_each(f->nodes, i) { struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL; if (prev && prev->cookie == i->cookie) { if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) { bch_err(c, "%s: found too many replicas for btree node", __func__); ret = -EINVAL; goto err; } prev->ptrs[prev->nr_ptrs++] = i->ptrs[0]; } else { f->nodes.data[dst++] = *i; } } f->nodes.nr = dst; sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); } dst = 0; darray_for_each(f->nodes, i) { if (i->overwritten) continue; ret = handle_overwrites(c, i, &darray_top(f->nodes)); if (ret) goto err; BUG_ON(i->overwritten); f->nodes.data[dst++] = *i; } f->nodes.nr = dst; if (c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); } eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); err: printbuf_exit(&buf); return ret; } static int found_btree_node_range_start_cmp(const void *_l, const void *_r) { const struct found_btree_node *l = _l; const struct found_btree_node *r = _r; return cmp_int(l->btree_id, r->btree_id) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->max_key, r->min_key); } #define for_each_found_btree_node_in_range(_f, _search, _idx) \ for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \ sizeof((_f)->nodes.data[0]), \ found_btree_node_range_start_cmp, &search); \ _idx < (_f)->nodes.nr && \ (_f)->nodes.data[_idx].btree_id == _search.btree_id && \ (_f)->nodes.data[_idx].level == _search.level && \ bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \ _idx = eytzinger0_next(_idx, (_f)->nodes.nr)) bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b) { struct find_btree_nodes *f = &c->found_btree_nodes; struct found_btree_node search = { .btree_id = b->c.btree_id, .level = b->c.level, .min_key = b->data->min_key, .max_key = b->key.k.p, }; for_each_found_btree_node_in_range(f, search, idx) if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data)) return true; return false; } bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree) { struct found_btree_node search = { .btree_id = btree, .level = 0, .min_key = POS_MIN, .max_key = SPOS_MAX, }; for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx) return true; return false; } int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos node_min, struct bpos node_max) { if (btree_id_is_alloc(btree)) return 0; struct find_btree_nodes *f = &c->found_btree_nodes; int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) return ret; if (c->opts.verbose) { struct printbuf buf = PRINTBUF; prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); bch2_bpos_to_text(&buf, node_min); prt_str(&buf, " - "); bch2_bpos_to_text(&buf, node_max); bch_info(c, "%s(): %s", __func__, buf.buf); printbuf_exit(&buf); } struct found_btree_node search = { .btree_id = btree, .level = level, .min_key = node_min, .max_key = node_max, }; for_each_found_btree_node_in_range(f, search, idx) { struct found_btree_node n = f->nodes.data[idx]; n.range_updated |= bpos_lt(n.min_key, node_min); n.min_key = bpos_max(n.min_key, node_min); n.range_updated |= bpos_gt(n.max_key, node_max); n.max_key = bpos_min(n.max_key, node_max); struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; found_btree_node_to_key(&tmp.k, &n); struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) return ret; } return 0; } void bch2_find_btree_nodes_exit(struct find_btree_nodes *f) { darray_exit(&f->nodes); }
21 21 21 21 21 21 21 21 21 21 21 21 31 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 /* +++ deflate.c */ /* deflate.c -- compress data using the deflation algorithm * Copyright (C) 1995-1996 Jean-loup Gailly. * For conditions of distribution and use, see copyright notice in zlib.h */ /* * ALGORITHM * * The "deflation" process depends on being able to identify portions * of the input text which are identical to earlier input (within a * sliding window trailing behind the input currently being processed). * * The most straightforward technique turns out to be the fastest for * most input files: try all possible matches and select the longest. * The key feature of this algorithm is that insertions into the string * dictionary are very simple and thus fast, and deletions are avoided * completely. Insertions are performed at each input character, whereas * string matches are performed only when the previous match ends. So it * is preferable to spend more time in matches to allow very fast string * insertions and avoid deletions. The matching algorithm for small * strings is inspired from that of Rabin & Karp. A brute force approach * is used to find longer strings when a small match has been found. * A similar algorithm is used in comic (by Jan-Mark Wams) and freeze * (by Leonid Broukhis). * A previous version of this file used a more sophisticated algorithm * (by Fiala and Greene) which is guaranteed to run in linear amortized * time, but has a larger average cost, uses more memory and is patented. * However the F&G algorithm may be faster for some highly redundant * files if the parameter max_chain_length (described below) is too large. * * ACKNOWLEDGEMENTS * * The idea of lazy evaluation of matches is due to Jan-Mark Wams, and * I found it in 'freeze' written by Leonid Broukhis. * Thanks to many people for bug reports and testing. * * REFERENCES * * Deutsch, L.P.,"DEFLATE Compressed Data Format Specification". * Available in ftp://ds.internic.net/rfc/rfc1951.txt * * A description of the Rabin and Karp algorithm is given in the book * "Algorithms" by R. Sedgewick, Addison-Wesley, p252. * * Fiala,E.R., and Greene,D.H. * Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595 * */ #include <linux/module.h> #include <linux/zutil.h> #include "defutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_deflate.h" #else #define DEFLATE_RESET_HOOK(strm) do {} while (0) #define DEFLATE_HOOK(strm, flush, bstate) 0 #define DEFLATE_NEED_CHECKSUM(strm) 1 #define DEFLATE_DFLTCC_ENABLED() 0 #endif /* =========================================================================== * Function prototypes. */ typedef block_state (*compress_func) (deflate_state *s, int flush); /* Compression function. Returns the block state after the call. */ static void fill_window (deflate_state *s); static block_state deflate_stored (deflate_state *s, int flush); static block_state deflate_fast (deflate_state *s, int flush); static block_state deflate_slow (deflate_state *s, int flush); static void lm_init (deflate_state *s); static void putShortMSB (deflate_state *s, uInt b); static int read_buf (z_streamp strm, Byte *buf, unsigned size); static uInt longest_match (deflate_state *s, IPos cur_match); #ifdef DEBUG_ZLIB static void check_match (deflate_state *s, IPos start, IPos match, int length); #endif /* =========================================================================== * Local data */ #define NIL 0 /* Tail of hash chains */ #ifndef TOO_FAR # define TOO_FAR 4096 #endif /* Matches of length 3 are discarded if their distance exceeds TOO_FAR */ #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) /* Minimum amount of lookahead, except at the end of the input file. * See deflate.c for comments about the MIN_MATCH+1. */ /* Workspace to be allocated for deflate processing */ typedef struct deflate_workspace { /* State memory for the deflator */ deflate_state deflate_memory; #ifdef CONFIG_ZLIB_DFLTCC /* State memory for s390 hardware deflate */ struct dfltcc_deflate_state dfltcc_memory; #endif Byte *window_memory; Pos *prev_memory; Pos *head_memory; char *overlay_memory; } deflate_workspace; #ifdef CONFIG_ZLIB_DFLTCC /* dfltcc_state must be doubleword aligned for DFLTCC call */ static_assert(offsetof(struct deflate_workspace, dfltcc_memory) % 8 == 0); #endif /* Values for max_lazy_match, good_match and max_chain_length, depending on * the desired pack level (0..9). The values given below have been tuned to * exclude worst case performance for pathological files. Better values may be * found for specific files. */ typedef struct config_s { ush good_length; /* reduce lazy search above this match length */ ush max_lazy; /* do not perform lazy search above this match length */ ush nice_length; /* quit search above this match length */ ush max_chain; compress_func func; } config; static const config configuration_table[10] = { /* good lazy nice chain */ /* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */ /* 1 */ {4, 4, 8, 4, deflate_fast}, /* maximum speed, no lazy matches */ /* 2 */ {4, 5, 16, 8, deflate_fast}, /* 3 */ {4, 6, 32, 32, deflate_fast}, /* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */ /* 5 */ {8, 16, 32, 32, deflate_slow}, /* 6 */ {8, 16, 128, 128, deflate_slow}, /* 7 */ {8, 32, 128, 256, deflate_slow}, /* 8 */ {32, 128, 258, 1024, deflate_slow}, /* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */ /* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4 * For deflate_fast() (levels <= 3) good is ignored and lazy has a different * meaning. */ #define EQUAL 0 /* result of memcmp for equal strings */ /* =========================================================================== * Update a hash value with the given input byte * IN assertion: all calls to UPDATE_HASH are made with consecutive * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. */ #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) /* =========================================================================== * Insert string str in the dictionary and set match_head to the previous head * of the hash chain (the most recent string with same hash key). Return * the previous length of the hash chain. * IN assertion: all calls to INSERT_STRING are made with consecutive * input characters and the first MIN_MATCH bytes of str are valid * (except for the last MIN_MATCH-1 bytes of the input file). */ #define INSERT_STRING(s, str, match_head) \ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \ s->head[s->ins_h] = (Pos)(str)) /* =========================================================================== * Initialize the hash table (avoiding 64K overflow for 16 bit systems). * prev[] will be initialized on the fly. */ #define CLEAR_HASH(s) \ s->head[s->hash_size-1] = NIL; \ memset((char *)s->head, 0, (unsigned)(s->hash_size-1)*sizeof(*s->head)); /* ========================================================================= */ int zlib_deflateInit2( z_streamp strm, int level, int method, int windowBits, int memLevel, int strategy ) { deflate_state *s; int noheader = 0; deflate_workspace *mem; char *next; ush *overlay; /* We overlay pending_buf and d_buf+l_buf. This works since the average * output size for (length,distance) codes is <= 24 bits. */ if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; if (level == Z_DEFAULT_COMPRESSION) level = 6; mem = (deflate_workspace *) strm->workspace; if (windowBits < 0) { /* undocumented feature: suppress zlib header */ noheader = 1; windowBits = -windowBits; } if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || windowBits < 9 || windowBits > 15 || level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { return Z_STREAM_ERROR; } /* * Direct the workspace's pointers to the chunks that were allocated * along with the deflate_workspace struct. */ next = (char *) mem; next += sizeof(*mem); #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ mem->window_memory = (Byte *) PTR_ALIGN(next, PAGE_SIZE); #else mem->window_memory = (Byte *) next; #endif next += zlib_deflate_window_memsize(windowBits); mem->prev_memory = (Pos *) next; next += zlib_deflate_prev_memsize(windowBits); mem->head_memory = (Pos *) next; next += zlib_deflate_head_memsize(memLevel); mem->overlay_memory = next; s = (deflate_state *) &(mem->deflate_memory); strm->state = (struct internal_state *)s; s->strm = strm; s->noheader = noheader; s->w_bits = windowBits; s->w_size = 1 << s->w_bits; s->w_mask = s->w_size - 1; s->hash_bits = memLevel + 7; s->hash_size = 1 << s->hash_bits; s->hash_mask = s->hash_size - 1; s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); s->window = (Byte *) mem->window_memory; s->prev = (Pos *) mem->prev_memory; s->head = (Pos *) mem->head_memory; s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ overlay = (ush *) mem->overlay_memory; s->pending_buf = (uch *) overlay; s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); s->d_buf = overlay + s->lit_bufsize/sizeof(ush); s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; s->level = level; s->strategy = strategy; s->method = (Byte)method; return zlib_deflateReset(strm); } /* ========================================================================= */ int zlib_deflateReset( z_streamp strm ) { deflate_state *s; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; strm->total_in = strm->total_out = 0; strm->msg = NULL; strm->data_type = Z_UNKNOWN; s = (deflate_state *)strm->state; s->pending = 0; s->pending_out = s->pending_buf; if (s->noheader < 0) { s->noheader = 0; /* was set to -1 by deflate(..., Z_FINISH); */ } s->status = s->noheader ? BUSY_STATE : INIT_STATE; strm->adler = 1; s->last_flush = Z_NO_FLUSH; zlib_tr_init(s); lm_init(s); DEFLATE_RESET_HOOK(strm); return Z_OK; } /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in * pending_buf. */ static void putShortMSB( deflate_state *s, uInt b ) { put_byte(s, (Byte)(b >> 8)); put_byte(s, (Byte)(b & 0xff)); } /* ========================================================================= */ int zlib_deflate( z_streamp strm, int flush ) { int old_flush; /* value of flush param for previous deflate call */ deflate_state *s; if (strm == NULL || strm->state == NULL || flush > Z_FINISH || flush < 0) { return Z_STREAM_ERROR; } s = (deflate_state *) strm->state; if ((strm->next_in == NULL && strm->avail_in != 0) || (s->status == FINISH_STATE && flush != Z_FINISH)) { return Z_STREAM_ERROR; } if (strm->avail_out == 0) return Z_BUF_ERROR; s->strm = strm; /* just in case */ old_flush = s->last_flush; s->last_flush = flush; /* Write the zlib header */ if (s->status == INIT_STATE) { uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8; uInt level_flags = (s->level-1) >> 1; if (level_flags > 3) level_flags = 3; header |= (level_flags << 6); if (s->strstart != 0) header |= PRESET_DICT; header += 31 - (header % 31); s->status = BUSY_STATE; putShortMSB(s, header); /* Save the adler32 of the preset dictionary: */ if (s->strstart != 0) { putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } strm->adler = 1L; } /* Flush as much pending output as possible */ if (s->pending != 0) { flush_pending(strm); if (strm->avail_out == 0) { /* Since avail_out is 0, deflate will be called again with * more output space, but possibly with both pending and * avail_in equal to zero. There won't be anything to do, * but this is not an error situation so make sure we * return OK instead of BUF_ERROR at next call of deflate: */ s->last_flush = -1; return Z_OK; } /* Make sure there is something to do and avoid duplicate consecutive * flushes. For repeated and useless calls with Z_FINISH, we keep * returning Z_STREAM_END instead of Z_BUFF_ERROR. */ } else if (strm->avail_in == 0 && flush <= old_flush && flush != Z_FINISH) { return Z_BUF_ERROR; } /* User must not provide more input after the first FINISH: */ if (s->status == FINISH_STATE && strm->avail_in != 0) { return Z_BUF_ERROR; } /* Start a new block or continue the current one. */ if (strm->avail_in != 0 || s->lookahead != 0 || (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { block_state bstate; bstate = DEFLATE_HOOK(strm, flush, &bstate) ? bstate : (*(configuration_table[s->level].func))(s, flush); if (bstate == finish_started || bstate == finish_done) { s->status = FINISH_STATE; } if (bstate == need_more || bstate == finish_started) { if (strm->avail_out == 0) { s->last_flush = -1; /* avoid BUF_ERROR next call, see above */ } return Z_OK; /* If flush != Z_NO_FLUSH && avail_out == 0, the next call * of deflate should use the same flush parameter to make sure * that the flush is complete. So we don't have to output an * empty block here, this will be done at next call. This also * ensures that for a very small output buffer, we emit at most * one empty block. */ } if (bstate == block_done) { if (flush == Z_PARTIAL_FLUSH) { zlib_tr_align(s); } else if (flush == Z_PACKET_FLUSH) { /* Output just the 3-bit `stored' block type value, but not a zero length. */ zlib_tr_stored_type_only(s); } else { /* FULL_FLUSH or SYNC_FLUSH */ zlib_tr_stored_block(s, (char*)0, 0L, 0); /* For a full flush, this empty block will be recognized * as a special marker by inflate_sync(). */ if (flush == Z_FULL_FLUSH) { CLEAR_HASH(s); /* forget history */ } } flush_pending(strm); if (strm->avail_out == 0) { s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ return Z_OK; } } } Assert(strm->avail_out > 0, "bug2"); if (flush != Z_FINISH) return Z_OK; if (!s->noheader) { /* Write zlib trailer (adler32) */ putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } flush_pending(strm); /* If avail_out is zero, the application will call deflate again * to flush the rest. */ if (!s->noheader) { s->noheader = -1; /* write the trailer only once! */ } if (s->pending == 0) { Assert(s->bi_valid == 0, "bi_buf not flushed"); return Z_STREAM_END; } return Z_OK; } /* ========================================================================= */ int zlib_deflateEnd( z_streamp strm ) { int status; deflate_state *s; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; s = (deflate_state *) strm->state; status = s->status; if (status != INIT_STATE && status != BUSY_STATE && status != FINISH_STATE) { return Z_STREAM_ERROR; } strm->state = NULL; return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. All deflate() input goes through * this function so some applications may wish to modify it to avoid * allocating a large strm->next_in buffer and copying from it. * (See also flush_pending()). */ static int read_buf( z_streamp strm, Byte *buf, unsigned size ) { unsigned len = strm->avail_in; if (len > size) len = size; if (len == 0) return 0; strm->avail_in -= len; if (!DEFLATE_NEED_CHECKSUM(strm)) {} else if (!((deflate_state *)(strm->state))->noheader) { strm->adler = zlib_adler32(strm->adler, strm->next_in, len); } memcpy(buf, strm->next_in, len); strm->next_in += len; strm->total_in += len; return (int)len; } /* =========================================================================== * Initialize the "longest match" routines for a new zlib stream */ static void lm_init( deflate_state *s ) { s->window_size = (ulg)2L*s->w_size; CLEAR_HASH(s); /* Set the default configuration parameters: */ s->max_lazy_match = configuration_table[s->level].max_lazy; s->good_match = configuration_table[s->level].good_length; s->nice_match = configuration_table[s->level].nice_length; s->max_chain_length = configuration_table[s->level].max_chain; s->strstart = 0; s->block_start = 0L; s->lookahead = 0; s->match_length = s->prev_length = MIN_MATCH-1; s->match_available = 0; s->ins_h = 0; } /* =========================================================================== * Set match_start to the longest match starting at the given string and * return its length. Matches shorter or equal to prev_length are discarded, * in which case the result is equal to prev_length and match_start is * garbage. * IN assertions: cur_match is the head of the hash chain for the current * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 * OUT assertion: the match length is not greater than s->lookahead. */ /* For 80x86 and 680x0, an optimized version will be provided in match.asm or * match.S. The code will be functionally equivalent. */ static uInt longest_match( deflate_state *s, IPos cur_match /* current match */ ) { unsigned chain_length = s->max_chain_length;/* max hash chain length */ register Byte *scan = s->window + s->strstart; /* current string */ register Byte *match; /* matched string */ register int len; /* length of current match */ int best_len = s->prev_length; /* best match length so far */ int nice_match = s->nice_match; /* stop if match long enough */ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? s->strstart - (IPos)MAX_DIST(s) : NIL; /* Stop when cur_match becomes <= limit. To simplify the code, * we prevent matches with the string of window index 0. */ Pos *prev = s->prev; uInt wmask = s->w_mask; #ifdef UNALIGNED_OK /* Compare two bytes at a time. Note: this is not always beneficial. * Try with and without -DUNALIGNED_OK to check. */ register Byte *strend = s->window + s->strstart + MAX_MATCH - 1; register ush scan_start = *(ush*)scan; register ush scan_end = *(ush*)(scan+best_len-1); #else register Byte *strend = s->window + s->strstart + MAX_MATCH; register Byte scan_end1 = scan[best_len-1]; register Byte scan_end = scan[best_len]; #endif /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. * It is easy to get rid of this optimization if necessary. */ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); /* Do not waste too much time if we already have a good match: */ if (s->prev_length >= s->good_match) { chain_length >>= 2; } /* Do not look for matches beyond the end of the input. This is necessary * to make deflate deterministic. */ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); do { Assert(cur_match < s->strstart, "no future"); match = s->window + cur_match; /* Skip to next match if the match length cannot increase * or if the match length is less than 2: */ #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) /* This code assumes sizeof(unsigned short) == 2. Do not use * UNALIGNED_OK if your compiler uses a different size. */ if (*(ush*)(match+best_len-1) != scan_end || *(ush*)match != scan_start) continue; /* It is not necessary to compare scan[2] and match[2] since they are * always equal when the other bytes match, given that the hash keys * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at * strstart+3, +5, ... up to strstart+257. We check for insufficient * lookahead only every 4th comparison; the 128th check will be made * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is * necessary to put more guard bytes at the end of the window, or * to check more often for insufficient lookahead. */ Assert(scan[2] == match[2], "scan[2]?"); scan++, match++; do { } while (*(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && scan < strend); /* The funny "do {}" generates better code on most compilers */ /* Here, scan <= window+strstart+257 */ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); if (*scan == *match) scan++; len = (MAX_MATCH - 1) - (int)(strend-scan); scan = strend - (MAX_MATCH-1); #else /* UNALIGNED_OK */ if (match[best_len] != scan_end || match[best_len-1] != scan_end1 || *match != *scan || *++match != scan[1]) continue; /* The check at best_len-1 can be removed because it will be made * again later. (This heuristic is not always a win.) * It is not necessary to compare scan[2] and match[2] since they * are always equal when the other bytes match, given that * the hash keys are equal and that HASH_BITS >= 8. */ scan += 2, match++; Assert(*scan == *match, "match[2]?"); /* We check for insufficient lookahead only every 8th comparison; * the 256th check will be made at strstart+258. */ do { } while (*++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && scan < strend); Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); len = MAX_MATCH - (int)(strend - scan); scan = strend - MAX_MATCH; #endif /* UNALIGNED_OK */ if (len > best_len) { s->match_start = cur_match; best_len = len; if (len >= nice_match) break; #ifdef UNALIGNED_OK scan_end = *(ush*)(scan+best_len-1); #else scan_end1 = scan[best_len-1]; scan_end = scan[best_len]; #endif } } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); if ((uInt)best_len <= s->lookahead) return best_len; return s->lookahead; } #ifdef DEBUG_ZLIB /* =========================================================================== * Check that the match at match_start is indeed a match. */ static void check_match( deflate_state *s, IPos start, IPos match, int length ) { /* check that the match is indeed a match */ if (memcmp((char *)s->window + match, (char *)s->window + start, length) != EQUAL) { fprintf(stderr, " start %u, match %u, length %d\n", start, match, length); do { fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); } while (--length != 0); z_error("invalid match"); } if (z_verbose > 1) { fprintf(stderr,"\\[%d,%d]", start-match, length); do { putc(s->window[start++], stderr); } while (--length != 0); } } #else # define check_match(s, start, match, length) #endif /* =========================================================================== * Fill the window when the lookahead becomes insufficient. * Updates strstart and lookahead. * * IN assertion: lookahead < MIN_LOOKAHEAD * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD * At least one byte has been read, or avail_in == 0; reads are * performed for at least two bytes (required for the zip translate_eol * option -- not supported here). */ static void fill_window( deflate_state *s ) { register unsigned n, m; register Pos *p; unsigned more; /* Amount of free space at the end of the window. */ uInt wsize = s->w_size; do { more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); /* Deal with !@#$% 64K limit: */ if (more == 0 && s->strstart == 0 && s->lookahead == 0) { more = wsize; } else if (more == (unsigned)(-1)) { /* Very unlikely, but possible on 16 bit machine if strstart == 0 * and lookahead == 1 (input done one byte at time) */ more--; /* If the window is almost full and there is insufficient lookahead, * move the upper half to the lower one to make room in the upper half. */ } else if (s->strstart >= wsize+MAX_DIST(s)) { memcpy((char *)s->window, (char *)s->window+wsize, (unsigned)wsize); s->match_start -= wsize; s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ s->block_start -= (long) wsize; /* Slide the hash table (could be avoided with 32 bit values at the expense of memory usage). We slide even when level == 0 to keep the hash table consistent if we switch back to level > 0 later. (Using level 0 permanently is not an optimal usage of zlib, so we don't care about this pathological case.) */ n = s->hash_size; p = &s->head[n]; do { m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); } while (--n); n = wsize; p = &s->prev[n]; do { m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); /* If n is not on any hash chain, prev[n] is garbage but * its value will never be used. */ } while (--n); more += wsize; } if (s->strm->avail_in == 0) return; /* If there was no sliding: * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && * more == window_size - lookahead - strstart * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) * => more >= window_size - 2*WSIZE + 2 * In the BIG_MEM or MMAP case (not yet supported), * window_size == input_size + MIN_LOOKAHEAD && * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. * Otherwise, window_size == 2*WSIZE so more >= 2. * If there was sliding, more >= WSIZE. So in all cases, more >= 2. */ Assert(more >= 2, "more < 2"); n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); s->lookahead += n; /* Initialize the hash value now that we have some input: */ if (s->lookahead >= MIN_MATCH) { s->ins_h = s->window[s->strstart]; UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif } /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, * but this is not important since only literal bytes will be emitted. */ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); } /* =========================================================================== * Flush the current block, with given end-of-file flag. * IN assertion: strstart is set to the end of the current match. */ #define FLUSH_BLOCK_ONLY(s, eof) { \ zlib_tr_flush_block(s, (s->block_start >= 0L ? \ (char *)&s->window[(unsigned)s->block_start] : \ NULL), \ (ulg)((long)s->strstart - s->block_start), \ (eof)); \ s->block_start = s->strstart; \ flush_pending(s->strm); \ Tracev((stderr,"[FLUSH]")); \ } /* Same but force premature exit if necessary. */ #define FLUSH_BLOCK(s, eof) { \ FLUSH_BLOCK_ONLY(s, eof); \ if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \ } /* =========================================================================== * Copy without compression as much as possible from the input stream, return * the current block state. * This function does not insert new strings in the dictionary since * uncompressible data is probably not useful. This function is used * only for the level=0 compression option. * NOTE: this function should be optimized to avoid extra copying from * window to pending_buf. */ static block_state deflate_stored( deflate_state *s, int flush ) { /* Stored blocks are limited to 0xffff bytes, pending_buf is limited * to pending_buf_size, and each stored block has a 5 byte header: */ ulg max_block_size = 0xffff; ulg max_start; if (max_block_size > s->pending_buf_size - 5) { max_block_size = s->pending_buf_size - 5; } /* Copy as much as possible from input to output: */ for (;;) { /* Fill the window as much as possible: */ if (s->lookahead <= 1) { Assert(s->strstart < s->w_size+MAX_DIST(s) || s->block_start >= (long)s->w_size, "slide too late"); fill_window(s); if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; if (s->lookahead == 0) break; /* flush the current block */ } Assert(s->block_start >= 0L, "block gone"); s->strstart += s->lookahead; s->lookahead = 0; /* Emit a stored block if pending_buf will be full: */ max_start = s->block_start + max_block_size; if (s->strstart == 0 || (ulg)s->strstart >= max_start) { /* strstart == 0 is possible when wraparound on 16-bit machine */ s->lookahead = (uInt)(s->strstart - max_start); s->strstart = (uInt)max_start; FLUSH_BLOCK(s, 0); } /* Flush if we may have to slide, otherwise block_start may become * negative and the data will be gone: */ if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { FLUSH_BLOCK(s, 0); } } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } /* =========================================================================== * Compress as much as possible from the input stream, return the current * block state. * This function does not perform lazy evaluation of matches and inserts * new strings in the dictionary only for unmatched strings or for short * matches. It is used only for the fast compression options. */ static block_state deflate_fast( deflate_state *s, int flush ) { IPos hash_head = NIL; /* head of the hash chain */ int bflush; /* set if current block must be flushed */ for (;;) { /* Make sure that we always have enough lookahead, except * at the end of the input file. We need MAX_MATCH bytes * for the next match, plus MIN_MATCH bytes to insert the * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } if (s->lookahead == 0) break; /* flush the current block */ } /* Insert the string window[strstart .. strstart+2] in the * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= MIN_MATCH) { INSERT_STRING(s, s->strstart, hash_head); } /* Find the longest match, discarding those <= prev_length. * At this point we have always match_length < MIN_MATCH */ if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ if (s->strategy != Z_HUFFMAN_ONLY) { s->match_length = longest_match (s, hash_head); } /* longest_match() sets match_start */ } if (s->match_length >= MIN_MATCH) { check_match(s, s->strstart, s->match_start, s->match_length); bflush = zlib_tr_tally(s, s->strstart - s->match_start, s->match_length - MIN_MATCH); s->lookahead -= s->match_length; /* Insert new strings in the hash table only if the match length * is not too large. This saves time but degrades compression. */ if (s->match_length <= s->max_insert_length && s->lookahead >= MIN_MATCH) { s->match_length--; /* string at strstart already in hash table */ do { s->strstart++; INSERT_STRING(s, s->strstart, hash_head); /* strstart never exceeds WSIZE-MAX_MATCH, so there are * always MIN_MATCH bytes ahead. */ } while (--s->match_length != 0); s->strstart++; } else { s->strstart += s->match_length; s->match_length = 0; s->ins_h = s->window[s->strstart]; UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not * matter since it will be recomputed at next deflate call. */ } } else { /* No match, output a literal byte */ Tracevv((stderr,"%c", s->window[s->strstart])); bflush = zlib_tr_tally (s, 0, s->window[s->strstart]); s->lookahead--; s->strstart++; } if (bflush) FLUSH_BLOCK(s, 0); } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } /* =========================================================================== * Same as above, but achieves better compression. We use a lazy * evaluation for matches: a match is finally adopted only if there is * no better match at the next window position. */ static block_state deflate_slow( deflate_state *s, int flush ) { IPos hash_head = NIL; /* head of hash chain */ int bflush; /* set if current block must be flushed */ /* Process the input block. */ for (;;) { /* Make sure that we always have enough lookahead, except * at the end of the input file. We need MAX_MATCH bytes * for the next match, plus MIN_MATCH bytes to insert the * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } if (s->lookahead == 0) break; /* flush the current block */ } /* Insert the string window[strstart .. strstart+2] in the * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= MIN_MATCH) { INSERT_STRING(s, s->strstart, hash_head); } /* Find the longest match, discarding those <= prev_length. */ s->prev_length = s->match_length, s->prev_match = s->match_start; s->match_length = MIN_MATCH-1; if (hash_head != NIL && s->prev_length < s->max_lazy_match && s->strstart - hash_head <= MAX_DIST(s)) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ if (s->strategy != Z_HUFFMAN_ONLY) { s->match_length = longest_match (s, hash_head); } /* longest_match() sets match_start */ if (s->match_length <= 5 && (s->strategy == Z_FILTERED || (s->match_length == MIN_MATCH && s->strstart - s->match_start > TOO_FAR))) { /* If prev_match is also MIN_MATCH, match_start is garbage * but we will ignore the current match anyway. */ s->match_length = MIN_MATCH-1; } } /* If there was a match at the previous step and the current * match is not better, output the previous match: */ if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; /* Do not insert strings in hash table beyond this. */ check_match(s, s->strstart-1, s->prev_match, s->prev_length); bflush = zlib_tr_tally(s, s->strstart -1 - s->prev_match, s->prev_length - MIN_MATCH); /* Insert in hash table all strings up to the end of the match. * strstart-1 and strstart are already inserted. If there is not * enough lookahead, the last two strings are not inserted in * the hash table. */ s->lookahead -= s->prev_length-1; s->prev_length -= 2; do { if (++s->strstart <= max_insert) { INSERT_STRING(s, s->strstart, hash_head); } } while (--s->prev_length != 0); s->match_available = 0; s->match_length = MIN_MATCH-1; s->strstart++; if (bflush) FLUSH_BLOCK(s, 0); } else if (s->match_available) { /* If there was no match at the previous position, output a * single literal. If there was a match but the current match * is longer, truncate the previous match to a single literal. */ Tracevv((stderr,"%c", s->window[s->strstart-1])); if (zlib_tr_tally (s, 0, s->window[s->strstart-1])) { FLUSH_BLOCK_ONLY(s, 0); } s->strstart++; s->lookahead--; if (s->strm->avail_out == 0) return need_more; } else { /* There is no previous match to compare with, wait for * the next step to decide. */ s->match_available = 1; s->strstart++; s->lookahead--; } } Assert (flush != Z_NO_FLUSH, "no flush?"); if (s->match_available) { Tracevv((stderr,"%c", s->window[s->strstart-1])); zlib_tr_tally (s, 0, s->window[s->strstart-1]); s->match_available = 0; } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } int zlib_deflate_workspacesize(int windowBits, int memLevel) { if (windowBits < 0) /* undocumented feature: suppress zlib header */ windowBits = -windowBits; /* Since the return value is typically passed to vmalloc() unchecked... */ BUG_ON(memLevel < 1 || memLevel > MAX_MEM_LEVEL || windowBits < 9 || windowBits > 15); return sizeof(deflate_workspace) + zlib_deflate_window_memsize(windowBits) + zlib_deflate_prev_memsize(windowBits) + zlib_deflate_head_memsize(memLevel) + zlib_deflate_overlay_memsize(memLevel); } int zlib_deflate_dfltcc_enabled(void) { return DEFLATE_DFLTCC_ENABLED(); }
16 17 17 17 17 17 23 17 17 19 2 2 1 1 17 2 16 15 2 2 1 1 1 1 7 43 25 25 24 23 22 23 22 22 22 20 21 20 19 7 22 42 23 23 17 15 3 3 17 23 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ematch.c Extended Match API * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * An extended match (ematch) is a small classification tool not worth * writing a full classifier for. Ematches can be interconnected to form * a logic expression and get attached to classifiers to extend their * functionatlity. * * The userspace part transforms the logic expressions into an array * consisting of multiple sequences of interconnected ematches separated * by markers. Precedence is implemented by a special ematch kind * referencing a sequence beyond the marker of the current sequence * causing the current position in the sequence to be pushed onto a stack * to allow the current position to be overwritten by the position referenced * in the special ematch. Matching continues in the new sequence until a * marker is reached causing the position to be restored from the stack. * * Example: * A AND (B1 OR B2) AND C AND D * * ------->-PUSH------- * -->-- / -->-- \ -->-- * / \ / / \ \ / \ * +-------+-------+-------+-------+-------+--------+ * | A AND | B AND | C AND | D END | B1 OR | B2 END | * +-------+-------+-------+-------+-------+--------+ * \ / * --------<-POP--------- * * where B is a virtual ematch referencing to sequence starting with B1. * * ========================================================================== * * How to write an ematch in 60 seconds * ------------------------------------ * * 1) Provide a matcher function: * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, * struct tcf_pkt_info *info) * { * struct mydata *d = (struct mydata *) m->data; * * if (...matching goes here...) * return 1; * else * return 0; * } * * 2) Fill out a struct tcf_ematch_ops: * static struct tcf_ematch_ops my_ops = { * .kind = unique id, * .datalen = sizeof(struct mydata), * .match = my_match, * .owner = THIS_MODULE, * }; * * 3) Register/Unregister your ematch: * static int __init init_my_ematch(void) * { * return tcf_em_register(&my_ops); * } * * static void __exit exit_my_ematch(void) * { * tcf_em_unregister(&my_ops); * } * * module_init(init_my_ematch); * module_exit(exit_my_ematch); * * 4) By now you should have two more seconds left, barely enough to * open up a beer to watch the compilation going. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; read_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) { if (kind == e->kind) { if (!try_module_get(e->owner)) e = NULL; read_unlock(&ematch_mod_lock); return e; } } read_unlock(&ematch_mod_lock); return NULL; } /** * tcf_em_register - register an extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their presence. * The given @ops must have kind set to a unique identifier and the * callback match() must be implemented. All other callbacks are optional * and a fallback implementation is used instead. * * Returns -EEXISTS if an ematch of the same kind has already registered. */ int tcf_em_register(struct tcf_ematch_ops *ops) { int err = -EEXIST; struct tcf_ematch_ops *e; if (ops->match == NULL) return -EINVAL; write_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) if (ops->kind == e->kind) goto errout; list_add_tail(&ops->link, &ematch_ops); err = 0; errout: write_unlock(&ematch_mod_lock); return err; } EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their disappearance * for examples when the module gets unloaded. The @ops parameter must be * the same as the one used for registration. * * Returns -ENOENT if no matching ematch was found. */ void tcf_em_unregister(struct tcf_ematch_ops *ops) { write_lock(&ematch_mod_lock); list_del(&ops->link); write_unlock(&ematch_mod_lock); } EXPORT_SYMBOL(tcf_em_unregister); static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, int index) { return &tree->matches[index]; } static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index * referencing an external ematch sequence. */ u32 ref; if (data_len < sizeof(ref)) goto errout; ref = *(u32 *) data; if (ref >= tree_hdr->nmatches) goto errout; /* We do not allow backward jumps to avoid loops and jumps * to our own position are of course illegal. */ if (ref <= idx) goto errout; em->data = ref; } else { /* Note: This lookup will increase the module refcnt * of the ematch module referenced. In case of a failure, * a destroy function is called by the underlying layer * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the * module is held if the ops field is non zero. */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { err = -ENOENT; #ifdef CONFIG_MODULES __rtnl_unlock(); request_module("ematch-kind-%u", em_hdr->kind); rtnl_lock(); em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops) { /* We dropped the RTNL mutex in order to * perform the module load. Tell the caller * to replay the request. */ module_put(em->ops->owner); em->ops = NULL; err = -EAGAIN; } #endif goto errout; } /* ematch module provides expected length of data, so we * can do a basic sanity check. */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; if (em->ops->change) { err = -EINVAL; if (em_hdr->flags & TCF_EM_SIMPLE) goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { /* ematch module doesn't provide an own change * procedure and expects us to allocate and copy * the ematch data. * * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (em->ops->datalen > 0) goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; } else { void *v = kmemdup(data, data_len, GFP_KERNEL); if (v == NULL) { err = -ENOBUFS; goto errout; } em->data = (unsigned long) v; } em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->net = net; err = 0; errout: return err; } static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, }; /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, * the changes would not be locked properly. * * Returns a negative error code if the configuration TLV contains errors. */ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { int idx, list_len, matches_len, err; struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; memset(tree, 0, sizeof(*tree)); if (!nla) return 0; err = nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL); if (err < 0) goto errout; err = -EINVAL; rt_hdr = tb[TCA_EMATCH_TREE_HDR]; rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); rt_match = nla_data(rt_list); list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. * * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking * to this policy will result in parsing failure. */ for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); if (err < 0) goto errout_abort; rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to * undefined references during the matching process. */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; } err = 0; errout: return err; errout_abort: tcf_em_tree_destroy(tree); return err; } EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree * * @tree: ematch tree to be deleted * * This functions destroys an ematch tree previously created by * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; if (tree->matches == NULL) return; for (i = 0; i < tree->hdr.nmatches; i++) { struct tcf_ematch *em = tcf_em_get_match(tree, i); if (em->ops) { if (em->ops->destroy) em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); } } tree->hdr.nmatches = 0; kfree(tree->matches); tree->matches = NULL; } EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message * * @skb: skb holding the rtnl message * @tree: ematch tree to be dumped * @tlv: TLV type to be used to encapsulate the tree * * This function dumps a ematch tree into a rtnl message. It is valid to * call this function while the ematch tree is in use. * * Returns -1 if the skb tailroom is insufficient. */ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; u8 *tail; struct nlattr *top_start; struct nlattr *list_start; top_start = nla_nest_start_noflag(skb, tlv); if (top_start == NULL) goto nla_put_failure; if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) goto nla_put_failure; list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) goto nla_put_failure; tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, .matchid = em->matchid, .flags = em->flags }; if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) nla_put_nohdr(skb, em->datalen, (void *) em->data); tail = skb_tail_pointer(skb); match_start->nla_len = tail - (u8 *)match_start; } nla_nest_end(skb, list_start); nla_nest_end(skb, top_start); return 0; nla_put_failure: return -1; } EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); return tcf_em_is_inverted(em) ? !r : r; } /* Do not use this function directly, use tcf_em_tree_match instead */ int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { int stackp = 0, match_idx = 0, res = 0; struct tcf_ematch *cur_match; int stack[CONFIG_NET_EMATCH_STACK]; proceed: while (match_idx < tree->hdr.nmatches) { cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_container(cur_match)) { if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) goto stack_overflow; stack[stackp++] = match_idx; match_idx = cur_match->data; goto proceed; } res = tcf_em_match(skb, cur_match, info); if (tcf_em_early_end(cur_match, res)) break; match_idx++; } pop_stack: if (stackp > 0) { match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_inverted(cur_match)) res = !res; if (tcf_em_early_end(cur_match, res)) { goto pop_stack; } else { match_idx++; goto proceed; } } return res; stack_overflow: net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match);
27 15 3 12 10 2 12 12 33 29 29 221 4 219 219 202 31 29 29 219 4 1 3 25 24 24 1 22 22 1 1 25 241 208 201 201 201 201 207 241 241 5 1 194 195 13 192 192 147 146 144 15 56 55 15 5 15 1 27 236 7 241 240 241 10 10 10 10 10 241 23 23 1 23 23 21 21 21 21 21 23 21 27 22 22 1 208 16 16 16 208 198 208 199 198 208 204 204 204 1 1 125 123 125 24 125 24 122 6 123 123 123 121 7 1 7 6 1 1 1 1 5 122 1 1 50 50 50 50 25 25 50 50 50 5 3 3 3 2 1 251 2 1 251 1 1 251 251 251 251 241 241 241 241 240 10 241 241 223 222 35 27 27 221 12 12 221 240 225 225 225 9 216 216 225 220 174 1 1 173 5 2 1 16 4 14 14 1 13 167 58 173 173 173 1 234 232 2 232 8 8 221 166 232 150 123 1 159 175 176 232 12 231 232 32 216 4 4 4 231 183 1 183 1 182 23 182 231 11 231 1 12 12 2 12 14 8 73 73 73 24 23 23 16 8 8 24 73 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Novell Inc. * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/fs.h> #include <linux/cred.h> #include <linux/ctype.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> #include <linux/mount.h> #include <linux/exportfs.h> #include "overlayfs.h" #include "../internal.h" /* for vfs_path_lookup */ struct ovl_lookup_data { struct super_block *sb; const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; bool xwhiteouts; bool stop; bool last; char *redirect; int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; }; static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; char *buf; struct ovl_fs *ofs = OVL_FS(d->sb); d->absolute_redirect = false; buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); if (buf[0] == '/') { d->absolute_redirect = true; /* * One of the ancestor path elements in an absolute path * lookup in ovl_lookup_layer() could have been opaque and * that will stop further lookup in lower layers (d->stop=true) * But we have found an absolute redirect in descendant path * element and that should force continue lookup in lower * layers (reset d->stop). */ d->stop = false; } else { res = strlen(buf) + 1; memmove(buf + prelen, buf, res); memcpy(buf, d->name.name, prelen); } strcat(buf, post); kfree(d->redirect); d->redirect = buf; d->name.name = d->redirect; d->name.len = strlen(d->redirect); return 0; } static int ovl_acceptable(void *ctx, struct dentry *dentry) { /* * A non-dir origin may be disconnected, which is fine, because * we only need it for its unique inode number. */ if (!d_is_dir(dentry)) return 1; /* Don't decode a deleted empty directory */ if (d_unhashed(dentry)) return 0; /* Check if directory belongs to the layer we are decoding from */ return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root); } /* * Check validity of an overlay file handle buffer. * * Return 0 for a valid file handle. * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; } static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox) { int res, err; struct ovl_fh *fh = NULL; res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; goto fail; } /* Zero size value means "copied up but origin unknown" */ if (res == 0) return NULL; fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res); if (res < 0) goto fail; err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; goto invalid; } return fh; out: kfree(fh); return NULL; fail: pr_warn_ratelimited("failed to get origin (%i)\n", res); goto out; invalid: pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); goto out; } struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, struct vfsmount *mnt, bool connected) { struct dentry *real; int bytes; if (!capable(CAP_DAC_READ_SEARCH)) return NULL; /* * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. * In case of uuid=off option just make sure that stored uuid is null. */ if (ovl_origin_uuid(ofs) ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : !uuid_is_null(&fh->fb.uuid)) return NULL; bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* * Treat stale file handle to lower file as "origin unknown". * upper file handle could become stale when upper file is * unlinked and this information is needed to handle stale * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } if (ovl_dentry_weird(real)) { dput(real); return NULL; } return real; } static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { spin_lock(&ret->d_lock); /* Recheck condition under lock */ if (d_is_negative(ret) && ret->d_lockref.count == 1) __d_drop(ret); spin_unlock(&ret->d_lock); } dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, const char *name, unsigned int namelen, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; bool is_upper = d->layer->idx == 0; char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; if (err == -ENOENT || err == -ENAMETOOLONG) goto out; goto out_err; } if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ err = -EREMOTE; goto out_err; } path.dentry = this; path.mnt = d->layer->mnt; if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } /* * This dentry should be a regular file if previous layer lookup * found a metacopy dentry. */ if (last_element && d->metacopy && !d_is_reg(this)) { d->stop = true; goto put_and_out; } if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; goto put_and_out; } err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; d->metacopy = err; d->stop = !d->metacopy; if (!d->metacopy || d->last) goto out; } else { if (ovl_lookup_trap_inode(d->sb, this)) { /* Caught in a trap of overlapping layers */ err = -ELOOP; goto out_err; } if (last_element) d->is_dir = true; if (d->last) goto out; /* overlay.opaque=x means xwhiteouts directory */ val = ovl_get_opaquedir_val(ofs, &path); if (last_element && !is_upper && val == 'x') { d->xwhiteouts = true; ovl_layer_set_xwhiteouts(ofs, d->layer); } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; goto out; } } err = ovl_check_redirect(&path, d, prelen, post); if (err) goto out_err; out: *ret = this; return 0; put_and_out: dput(this); this = NULL; goto out; out_err: dput(this); return err; } static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret, bool drop_negative) { /* Counting down from the end, since the prefix can change */ size_t rem = d->name.len - 1; struct dentry *dentry = NULL; int err; if (d->name.name[0] != '/') return ovl_lookup_single(base, d, d->name.name, d->name.len, 0, "", ret, drop_negative); while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { const char *s = d->name.name + d->name.len - rem; const char *next = strchrnul(s, '/'); size_t thislen = next - s; bool end = !next[0]; /* Verify we did not go off the rails */ if (WARN_ON(s[-1] != '/')) return -EIO; err = ovl_lookup_single(base, d, s, thislen, d->name.len - rem, next, &base, drop_negative); dput(dentry); if (err) return err; dentry = base; if (end) break; rem -= thislen + 1; if (WARN_ON(rem >= d->name.len)) return -EIO; } *ret = dentry; return 0; } static int ovl_lookup_data_layer(struct dentry *dentry, const char *redirect, const struct ovl_layer *layer, struct path *datapath) { int err; err = vfs_path_lookup(layer->mnt->mnt_root, layer->mnt, redirect, LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | LOOKUP_NO_XDEV, datapath); pr_debug("lookup lowerdata (%pd2, redirect=\"%s\", layer=%d, err=%i)\n", dentry, redirect, layer->idx, err); if (err) return err; err = -EREMOTE; if (ovl_dentry_weird(datapath->dentry)) goto out_path_put; err = -ENOENT; /* Only regular file is acceptable as lower data */ if (!d_is_reg(datapath->dentry)) goto out_path_put; return 0; out_path_put: path_put(datapath); return err; } /* Lookup in data-only layers by absolute redirect to layer root */ static int ovl_lookup_data_layers(struct dentry *dentry, const char *redirect, struct ovl_path *lowerdata) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); const struct ovl_layer *layer; struct path datapath; int err = -ENOENT; int i; layer = &ofs->layers[ofs->numlayer - ofs->numdatalayer]; for (i = 0; i < ofs->numdatalayer; i++, layer++) { err = ovl_lookup_data_layer(dentry, redirect, layer, &datapath); if (!err) { mntput(datapath.mnt); lowerdata->dentry = datapath.dentry; lowerdata->layer = layer; return 0; } } return err; } int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) { struct dentry *origin = NULL; int i; for (i = 1; i <= ovl_numlowerlayer(ofs); i++) { /* * If lower fs uuid is not unique among lower fs we cannot match * fh->uuid to layer. */ if (ofs->layers[i].fsid && ofs->layers[i].fs->bad_uuid) continue; origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt, connected); if (origin) break; } if (!origin) return -ESTALE; else if (IS_ERR(origin)) return PTR_ERR(origin); if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) && inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; if (!*stackp) *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; } **stackp = (struct ovl_path){ .dentry = origin, .layer = &ofs->layers[i] }; return 0; invalid: pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); return -ESTALE; } static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, struct ovl_path **stackp) { struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN); int err; if (IS_ERR_OR_NULL(fh)) return PTR_ERR(fh); err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); kfree(fh); if (err) { if (err == -ESTALE) return 0; return err; } return 0; } /* * Verify that @fh matches the file handle stored in xattr @name. * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh) { struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox); int err = 0; if (!ofh) return -ENODATA; if (IS_ERR(ofh)) return PTR_ERR(ofh); if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); return err; } int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh, bool is_upper, bool set) { int err; err = ovl_verify_fh(ofs, dentry, ox, fh); if (set && err == -ENODATA) err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); return err; } /* * Verify that @real dentry matches the file handle stored in xattr @name. * * If @set is true and there is no stored file handle, encode @real and store * file handle in xattr @name. * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; int err; fh = ovl_encode_real_fh(ofs, real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; goto fail; } err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set); if (err) goto fail; out: kfree(fh); return err; fail: inode = d_inode(real); pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? inode->i_ino : 0, err); goto out; } /* Get upper dentry from index */ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected) { struct ovl_fh *fh; struct dentry *upper; if (!d_is_dir(index)) return dget(index); fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER); if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected); kfree(fh); if (IS_ERR_OR_NULL(upper)) return upper ?: ERR_PTR(-ESTALE); if (!d_is_dir(upper)) { pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", index, upper); dput(upper); return ERR_PTR(-EIO); } return upper; } /* * Verify that an index entry name matches the origin file handle stored in * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) { struct ovl_fh *fh = NULL; size_t len; struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *upper = NULL; int err; if (!d_inode(index)) return 0; err = -EINVAL; if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; /* * Whiteout index entries are used as an indication that an exported * overlay file handle should be treated as stale (i.e. after unlink * of the overlay inode). These entries contain no origin xattr. */ if (ovl_is_whiteout(index)) goto out; /* * Verifying directory index entries are not stale is expensive, so * only verify stale dir index if NFS export is enabled. */ if (d_is_dir(index) && !ofs->config.nfs_export) goto out; /* * Directory index entries should have 'upper' xattr pointing to the * real upper dir. Non-dir index entries are hardlinks to the upper * real inode. For non-dir index, we can read the copy up origin xattr * directly from the index dentry, but for dir index we first need to * decode the upper directory. */ upper = ovl_index_upper(ofs, index, false); if (IS_ERR_OR_NULL(upper)) { err = PTR_ERR(upper); /* * Directory index entries with no 'upper' xattr need to be * removed. When dir index entry has a stale 'upper' xattr, * we assume that upper dir was removed and we treat the dir * index as orphan entry that needs to be whited out. */ if (err == -ESTALE) goto orphan; else if (!err) err = -ESTALE; goto fail; } err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh); dput(upper); if (err) goto fail; /* Check if non-dir index is orphan and don't warn before cleaning it */ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { err = ovl_check_origin_fh(ofs, fh, false, index, &stack); if (err) goto fail; if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0) goto orphan; } out: dput(origin.dentry); kfree(fh); return err; fail: pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; orphan: pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(index)->i_nlink); err = -ENOENT; goto out; } int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name) { char *n, *s; n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; } /* * Lookup in indexdir for the index entry of a lower real inode or a copy up * origin inode. The index entry name is the hex representation of the lower * inode file handle. * * If the index dentry in negative, then either no lower aliases have been * copied up yet, or aliases have been copied up in older kernels and are * not indexed. * * If the index dentry for a copy up origin inode is positive, but points * to an inode different than the upper inode, then either the upper inode * has been copied up and not indexed or it was indexed, but since then * index dir was cleared. Either way, that index cannot be used to identify * the overlay inode. */ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name) { struct ovl_fh *fh; int err; fh = ovl_encode_real_fh(ofs, origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); err = ovl_get_index_name_fh(fh, name); kfree(fh); return err; } /* Lookup index by file handle for NFS export */ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) { struct dentry *index; struct qstr name; int err; err = ovl_get_index_name_fh(fh, &name); if (err) return ERR_PTR(err); index = lookup_positive_unlocked(name.name, ofs->workdir, name.len); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) index = NULL; return index; } if (ovl_is_whiteout(index)) err = -ESTALE; else if (ovl_dentry_weird(index)) err = -EIO; else return index; dput(index); return ERR_PTR(err); } struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify) { struct dentry *index; struct inode *inode; struct qstr name; bool is_dir = d_is_dir(origin); int err; err = ovl_get_index_name(ofs, origin, &name); if (err) return ERR_PTR(err); index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, ofs->workdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { index = NULL; goto out; } pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); goto out; } inode = d_inode(index); if (ovl_is_whiteout(index) && !verify) { /* * When index lookup is called with !verify for decoding an * overlay file handle, a whiteout index implies that decode * should treat file handle as stale and no need to print a * warning about it. */ dput(index); index = ERR_PTR(-ESTALE); goto out; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || inode_wrong_type(inode, d_inode(origin)->i_mode)) { /* * Index should always be of the same file type as origin * except for the case of a whiteout index. A whiteout * index should only exist if all lower aliases have been * unlinked, which means that finding a lower origin on lookup * whose index is a whiteout should be treated as an error. */ pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; } else if (is_dir && verify) { if (!upper) { pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", origin, index); goto fail; } /* Verify that dir index 'upper' xattr points to upper dir */ err = ovl_verify_upper(ofs, index, upper, false); if (err) { if (err == -ESTALE) { pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", upper, origin, index); } goto fail; } } else if (upper && d_inode(upper) != inode) { goto out_dput; } out: kfree(name.name); return index; out_dput: dput(index); index = NULL; goto out; fail: dput(index); index = ERR_PTR(-EIO); goto out; } /* * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path, const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); if (path->dentry) { *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; *layer = lowerstack[idx - 1].layer; path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } /* Fix missing 'origin' xattr */ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, struct dentry *lower, struct dentry *upper) { const struct ovl_fh *fh; int err; if (ovl_check_origin_xattr(ofs, upper)) return 0; fh = ovl_get_origin_fh(ofs, lower); if (IS_ERR(fh)) return PTR_ERR(fh); err = ovl_want_write(dentry); if (err) goto out; err = ovl_set_origin_fh(ofs, fh, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); ovl_drop_write(dentry); out: kfree(fh); return err; } static int ovl_maybe_validate_verity(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct inode *inode = d_inode(dentry); struct path datapath, metapath; int err; if (!ofs->config.verity_mode || !ovl_is_metacopy_dentry(dentry) || ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) return 0; if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) { if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", dentry); return -EIO; } return 0; } ovl_path_lowerdata(dentry, &datapath); if (!datapath.dentry) return -EIO; ovl_path_real(dentry, &metapath); if (!metapath.dentry) return -EIO; err = ovl_inode_lock_interruptible(inode); if (err) return err; if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_validate_verity(ofs, &metapath, &datapath); if (err == 0) ovl_set_flag(OVL_VERIFIED_DIGEST, inode); revert_creds(old_cred); } ovl_inode_unlock(inode); return err; } /* Lazy lookup of lowerdata */ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) { struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); struct ovl_path datapath = {}; const struct cred *old_cred; int err; if (!redirect || ovl_dentry_lowerdata(dentry)) return 0; if (redirect[0] != '/') return -EIO; err = ovl_inode_lock_interruptible(inode); if (err) return err; err = 0; /* Someone got here before us? */ if (ovl_dentry_lowerdata(dentry)) goto out; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_lookup_data_layers(dentry, redirect, &datapath); revert_creds(old_cred); if (err) goto out_err; err = ovl_dentry_set_lowerdata(dentry, &datapath); if (err) goto out_err; out: ovl_inode_unlock(inode); dput(datapath.dentry); return err; out_err: pr_warn_ratelimited("lazy lowerdata lookup failed (%pd2, err=%i)\n", dentry, err); goto out; } int ovl_verify_lowerdata(struct dentry *dentry) { int err; err = ovl_maybe_lookup_lowerdata(dentry); if (err) return err; return ovl_maybe_validate_verity(dentry); } struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = NULL; const struct cred *old_cred; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); struct ovl_path *stack = NULL, *origin_path = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *origin = NULL; struct dentry *index = NULL; unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; char *upperredirect = NULL; struct dentry *this; unsigned int i; int err; bool uppermetacopy = false; int metacopy_size = 0; struct ovl_lookup_data d = { .sb = dentry->d_sb, .name = dentry->d_name, .is_dir = false, .opaque = false, .stop = false, .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), .redirect = NULL, .metacopy = 0, }; if (dentry->d_name.len > ofs->namelen) return ERR_PTR(-ENAMETOOLONG); old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { dput(upperdentry); err = -EREMOTE; goto out; } if (upperdentry && !d.is_dir) { /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, * because we only need to hold the origin inode in * cache and use its inode number. We may even get a * connected dentry, that is not under any of the lower * layers root. That is also fine for using it's inode * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ err = ovl_check_origin(ofs, upperdentry, &origin_path); if (err) goto out_put_upper; if (d.metacopy) uppermetacopy = true; metacopy_size = d.metacopy; } if (d.redirect) { err = -ENOMEM; upperredirect = kstrdup(d.redirect, GFP_KERNEL); if (!upperredirect) goto out_put_upper; if (d.redirect[0] == '/') poe = roe; } upperopaque = d.opaque; } if (!d.stop && ovl_numlower(poe)) { err = -ENOMEM; stack = ovl_stack_alloc(ofs->numlayer - 1); if (!stack) goto out_put_upper; } for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; if (!ovl_redirect_follow(ofs)) d.last = i == ovl_numlower(poe) - 1; else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; if (!this) continue; if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { dput(this); err = -EPERM; pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; } /* * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". */ if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { err = ovl_fix_origin(ofs, dentry, this, upperdentry); if (err) { dput(this); goto out_put; } } /* * When "verify_lower" feature is enabled, do not merge with a * lower dir that does not match a stored origin xattr. In any * case, only verified origin is used for index lookup. * * For non-dir dentry, if index=on, then ensure origin * matches the dentry found using path based lookup, * otherwise error out. */ if (upperdentry && !ctr && ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || (!d.is_dir && ofs->config.index && origin_path))) { err = ovl_verify_origin(ofs, upperdentry, this, false); if (err) { dput(this); if (d.is_dir) break; goto out_put; } origin = this; } if (!upperdentry && !d.is_dir && !ctr && d.metacopy) metacopy_size = d.metacopy; if (d.metacopy && ctr) { /* * Do not store intermediate metacopy dentries in * lower chain, except top most lower metacopy dentry. * Continue the loop so that if there is an absolute * redirect on this dentry, poe can be reset to roe. */ dput(this); this = NULL; } else { stack[ctr].dentry = this; stack[ctr].layer = lower.layer; ctr++; } /* * Following redirects can have security consequences: it's like * a symlink into the lower layer without the permission checks. * This is only a problem if the upper layer is untrusted (e.g * comes from an USB drive). This can allow a non-readable file * or directory to become readable. * * Only following redirects when redirects are enabled disables * this attack vector when not necessary. */ err = -EPERM; if (d.redirect && !ovl_redirect_follow(ofs)) { pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", dentry); goto out_put; } if (d.stop) break; if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ i = lower.layer->idx - 1; } } /* Defer lookup of lowerdata in data-only layers to first access */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { d.metacopy = 0; ctr++; } /* * For regular non-metacopy upper dentries, there is no lower * path based lookup, hence ctr will be zero. If a dentry is found * using ORIGIN xattr on upper, install it in stack. * * For metacopy dentry, path based lookup will find lower dentries. * Just make sure a corresponding data dentry has been found. */ if (d.metacopy || (uppermetacopy && !ctr)) { pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", dentry); err = -EIO; goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { if (WARN_ON(stack != NULL)) { err = -EIO; goto out_put; } stack = origin_path; ctr = 1; origin = origin_path->dentry; origin_path = NULL; } /* * Always lookup index if there is no-upperdentry. * * For the case of upperdentry, we have set origin by now if it * needed to be set. There are basically three cases. * * For directories, lookup index by lower inode and verify it matches * upper inode. We only trust dir index if we verified that lower dir * matches origin, otherwise dir index entries may be inconsistent * and we ignore them. * * For regular upper, we already set origin if upper had ORIGIN * xattr. There is no verification though as there is no path * based dentry lookup in lower in this case. * * For metacopy upper, we set a verified origin already if index * is enabled and if upper had an ORIGIN xattr. * */ if (!upperdentry && ctr) origin = stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && (!d.is_dir || ovl_index_all(dentry->d_sb))) { index = ovl_lookup_index(ofs, upperdentry, origin, true); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; goto out_put; } } if (ctr) { oe = ovl_alloc_entry(ctr); err = -ENOMEM; if (!oe) goto out_put; ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr); } if (upperopaque) ovl_dentry_set_opaque(dentry); if (d.xwhiteouts) ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); else if (index) { struct path upperpath = { .dentry = upperdentry = dget(index), .mnt = ovl_upper_mnt(ofs), }; /* * It's safe to assign upperredirect here: the previous * assignment of happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); upperredirect = NULL; goto out_free_oe; } err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) goto out_free_oe; uppermetacopy = err; metacopy_size = err; } if (upperdentry || ctr) { struct ovl_inode_params oip = { .upperdentry = upperdentry, .oe = oe, .index = index, .redirect = upperredirect, }; /* Store lowerdata redirect for lazy lookup */ if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) { oip.lowerdata_redirect = d.redirect; d.redirect = NULL; } inode = ovl_get_inode(dentry->d_sb, &oip); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; if (upperdentry && !uppermetacopy) ovl_set_flag(OVL_UPPERDATA, inode); if (metacopy_size > OVL_METACOPY_MIN_SIZE) ovl_set_flag(OVL_HAS_DIGEST, inode); } ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); revert_creds(old_cred); if (origin_path) { dput(origin_path->dentry); kfree(origin_path); } dput(index); ovl_stack_free(stack, ctr); kfree(d.redirect); return d_splice_alias(inode, dentry); out_free_oe: ovl_free_entry(oe); out_put: dput(index); ovl_stack_free(stack, ctr); out_put_upper: if (origin_path) { dput(origin_path->dentry); kfree(origin_path); } dput(upperdentry); kfree(upperredirect); out: kfree(d.redirect); revert_creds(old_cred); return ERR_PTR(err); } bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); const struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; bool done = false; /* * If dentry is negative, then lower is positive iff this is a * whiteout. */ if (!dentry->d_inode) return ovl_dentry_is_opaque(dentry); /* Negative upper -> positive lower */ if (!ovl_dentry_upper(dentry)) return true; old_cred = ovl_override_creds(dentry->d_sb); /* Positive upper -> have to look up lower to see whether it exists */ for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { struct dentry *this; struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), name->name, parentpath->dentry, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: case -ENAMETOOLONG: break; default: /* * Assume something is there, we just couldn't * access it. */ positive = true; break; } } else { struct path path = { .dentry = this, .mnt = parentpath->layer->mnt, }; positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); done = true; dput(this); } } revert_creds(old_cred); return positive; }
2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 // SPDX-License-Identifier: GPL-2.0-or-later /* * HackRF driver * * Copyright (C) 2014 Antti Palosaari <crope@iki.fi> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <media/v4l2-device.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/videobuf2-v4l2.h> #include <media/videobuf2-vmalloc.h> /* * Used Avago MGA-81563 RF amplifier could be destroyed pretty easily with too * strong signal or transmitting to bad antenna. * Set RF gain control to 'grabbed' state by default for sure. */ static bool hackrf_enable_rf_gain_ctrl; module_param_named(enable_rf_gain_ctrl, hackrf_enable_rf_gain_ctrl, bool, 0644); MODULE_PARM_DESC(enable_rf_gain_ctrl, "enable RX/TX RF amplifier control (warn: could damage amplifier)"); /* HackRF USB API commands (from HackRF Library) */ enum { CMD_SET_TRANSCEIVER_MODE = 0x01, CMD_SAMPLE_RATE_SET = 0x06, CMD_BASEBAND_FILTER_BANDWIDTH_SET = 0x07, CMD_BOARD_ID_READ = 0x0e, CMD_VERSION_STRING_READ = 0x0f, CMD_SET_FREQ = 0x10, CMD_AMP_ENABLE = 0x11, CMD_SET_LNA_GAIN = 0x13, CMD_SET_VGA_GAIN = 0x14, CMD_SET_TXVGA_GAIN = 0x15, }; /* * bEndpointAddress 0x81 EP 1 IN * Transfer Type Bulk * wMaxPacketSize 0x0200 1x 512 bytes */ #define MAX_BULK_BUFS (6) #define BULK_BUFFER_SIZE (128 * 512) static const struct v4l2_frequency_band bands_adc_dac[] = { { .tuner = 0, .type = V4L2_TUNER_SDR, .index = 0, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 200000, .rangehigh = 24000000, }, }; static const struct v4l2_frequency_band bands_rx_tx[] = { { .tuner = 1, .type = V4L2_TUNER_RF, .index = 0, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 1, .rangehigh = 4294967294LL, /* max u32, hw goes over 7GHz */ }, }; /* stream formats */ struct hackrf_format { u32 pixelformat; u32 buffersize; }; /* format descriptions for capture and preview */ static struct hackrf_format formats[] = { { .pixelformat = V4L2_SDR_FMT_CS8, .buffersize = BULK_BUFFER_SIZE, }, }; static const unsigned int NUM_FORMATS = ARRAY_SIZE(formats); /* intermediate buffers with raw data from the USB device */ struct hackrf_buffer { struct vb2_v4l2_buffer vb; struct list_head list; }; struct hackrf_dev { #define USB_STATE_URB_BUF 1 /* XXX: set manually */ #define RX_ON 4 #define TX_ON 5 #define RX_ADC_FREQUENCY 11 #define TX_DAC_FREQUENCY 12 #define RX_BANDWIDTH 13 #define TX_BANDWIDTH 14 #define RX_RF_FREQUENCY 15 #define TX_RF_FREQUENCY 16 #define RX_RF_GAIN 17 #define TX_RF_GAIN 18 #define RX_IF_GAIN 19 #define RX_LNA_GAIN 20 #define TX_LNA_GAIN 21 unsigned long flags; struct usb_interface *intf; struct device *dev; struct usb_device *udev; struct video_device rx_vdev; struct video_device tx_vdev; struct v4l2_device v4l2_dev; /* videobuf2 queue and queued buffers list */ struct vb2_queue rx_vb2_queue; struct vb2_queue tx_vb2_queue; struct list_head rx_buffer_list; struct list_head tx_buffer_list; spinlock_t buffer_list_lock; /* Protects buffer_list */ unsigned int sequence; /* Buffer sequence counter */ unsigned int vb_full; /* vb is full and packets dropped */ unsigned int vb_empty; /* vb is empty and packets dropped */ /* Note if taking both locks v4l2_lock must always be locked first! */ struct mutex v4l2_lock; /* Protects everything else */ struct mutex vb_queue_lock; /* Protects vb_queue */ struct urb *urb_list[MAX_BULK_BUFS]; int buf_num; unsigned long buf_size; u8 *buf_list[MAX_BULK_BUFS]; dma_addr_t dma_addr[MAX_BULK_BUFS]; int urbs_initialized; int urbs_submitted; /* USB control message buffer */ #define BUF_SIZE 24 u8 buf[BUF_SIZE]; /* Current configuration */ unsigned int f_adc; unsigned int f_dac; unsigned int f_rx; unsigned int f_tx; u32 pixelformat; u32 buffersize; /* Controls */ struct v4l2_ctrl_handler rx_ctrl_handler; struct v4l2_ctrl *rx_bandwidth_auto; struct v4l2_ctrl *rx_bandwidth; struct v4l2_ctrl *rx_rf_gain; struct v4l2_ctrl *rx_lna_gain; struct v4l2_ctrl *rx_if_gain; struct v4l2_ctrl_handler tx_ctrl_handler; struct v4l2_ctrl *tx_bandwidth_auto; struct v4l2_ctrl *tx_bandwidth; struct v4l2_ctrl *tx_rf_gain; struct v4l2_ctrl *tx_lna_gain; /* Sample rate calc */ unsigned long jiffies_next; unsigned int sample; unsigned int sample_measured; }; #define hackrf_dbg_usb_control_msg(_dev, _r, _t, _v, _i, _b, _l) { \ char *_direction; \ if (_t & USB_DIR_IN) \ _direction = "<<<"; \ else \ _direction = ">>>"; \ dev_dbg(_dev, "%02x %02x %02x %02x %02x %02x %02x %02x %s %*ph\n", \ _t, _r, _v & 0xff, _v >> 8, _i & 0xff, \ _i >> 8, _l & 0xff, _l >> 8, _direction, _l, _b); \ } /* execute firmware command */ static int hackrf_ctrl_msg(struct hackrf_dev *dev, u8 request, u16 value, u16 index, u8 *data, u16 size) { int ret; unsigned int pipe; u8 requesttype; switch (request) { case CMD_SET_TRANSCEIVER_MODE: case CMD_SET_FREQ: case CMD_AMP_ENABLE: case CMD_SAMPLE_RATE_SET: case CMD_BASEBAND_FILTER_BANDWIDTH_SET: pipe = usb_sndctrlpipe(dev->udev, 0); requesttype = (USB_TYPE_VENDOR | USB_DIR_OUT); break; case CMD_BOARD_ID_READ: case CMD_VERSION_STRING_READ: case CMD_SET_LNA_GAIN: case CMD_SET_VGA_GAIN: case CMD_SET_TXVGA_GAIN: pipe = usb_rcvctrlpipe(dev->udev, 0); requesttype = (USB_TYPE_VENDOR | USB_DIR_IN); break; default: dev_err(dev->dev, "Unknown command %02x\n", request); ret = -EINVAL; goto err; } /* write request */ if (!(requesttype & USB_DIR_IN)) memcpy(dev->buf, data, size); ret = usb_control_msg(dev->udev, pipe, request, requesttype, value, index, dev->buf, size, 1000); hackrf_dbg_usb_control_msg(dev->dev, request, requesttype, value, index, dev->buf, size); if (ret < 0) { dev_err(dev->dev, "usb_control_msg() failed %d request %02x\n", ret, request); goto err; } /* read request */ if (requesttype & USB_DIR_IN) memcpy(data, dev->buf, size); return 0; err: return ret; } static int hackrf_set_params(struct hackrf_dev *dev) { struct usb_interface *intf = dev->intf; int ret, i; u8 buf[8], u8tmp; unsigned int uitmp, uitmp1, uitmp2; const bool rx = test_bit(RX_ON, &dev->flags); const bool tx = test_bit(TX_ON, &dev->flags); static const struct { u32 freq; } bandwidth_lut[] = { { 1750000}, /* 1.75 MHz */ { 2500000}, /* 2.5 MHz */ { 3500000}, /* 3.5 MHz */ { 5000000}, /* 5 MHz */ { 5500000}, /* 5.5 MHz */ { 6000000}, /* 6 MHz */ { 7000000}, /* 7 MHz */ { 8000000}, /* 8 MHz */ { 9000000}, /* 9 MHz */ {10000000}, /* 10 MHz */ {12000000}, /* 12 MHz */ {14000000}, /* 14 MHz */ {15000000}, /* 15 MHz */ {20000000}, /* 20 MHz */ {24000000}, /* 24 MHz */ {28000000}, /* 28 MHz */ }; if (!rx && !tx) { dev_dbg(&intf->dev, "device is sleeping\n"); return 0; } /* ADC / DAC frequency */ if (rx && test_and_clear_bit(RX_ADC_FREQUENCY, &dev->flags)) { dev_dbg(&intf->dev, "RX ADC frequency=%u Hz\n", dev->f_adc); uitmp1 = dev->f_adc; uitmp2 = 1; set_bit(TX_DAC_FREQUENCY, &dev->flags); } else if (tx && test_and_clear_bit(TX_DAC_FREQUENCY, &dev->flags)) { dev_dbg(&intf->dev, "TX DAC frequency=%u Hz\n", dev->f_dac); uitmp1 = dev->f_dac; uitmp2 = 1; set_bit(RX_ADC_FREQUENCY, &dev->flags); } else { uitmp1 = uitmp2 = 0; } if (uitmp1 || uitmp2) { buf[0] = (uitmp1 >> 0) & 0xff; buf[1] = (uitmp1 >> 8) & 0xff; buf[2] = (uitmp1 >> 16) & 0xff; buf[3] = (uitmp1 >> 24) & 0xff; buf[4] = (uitmp2 >> 0) & 0xff; buf[5] = (uitmp2 >> 8) & 0xff; buf[6] = (uitmp2 >> 16) & 0xff; buf[7] = (uitmp2 >> 24) & 0xff; ret = hackrf_ctrl_msg(dev, CMD_SAMPLE_RATE_SET, 0, 0, buf, 8); if (ret) goto err; } /* bandwidth */ if (rx && test_and_clear_bit(RX_BANDWIDTH, &dev->flags)) { if (dev->rx_bandwidth_auto->val == true) uitmp = dev->f_adc; else uitmp = dev->rx_bandwidth->val; for (i = 0; i < ARRAY_SIZE(bandwidth_lut); i++) { if (uitmp <= bandwidth_lut[i].freq) { uitmp = bandwidth_lut[i].freq; break; } } dev->rx_bandwidth->val = uitmp; dev->rx_bandwidth->cur.val = uitmp; dev_dbg(&intf->dev, "RX bandwidth selected=%u\n", uitmp); set_bit(TX_BANDWIDTH, &dev->flags); } else if (tx && test_and_clear_bit(TX_BANDWIDTH, &dev->flags)) { if (dev->tx_bandwidth_auto->val == true) uitmp = dev->f_dac; else uitmp = dev->tx_bandwidth->val; for (i = 0; i < ARRAY_SIZE(bandwidth_lut); i++) { if (uitmp <= bandwidth_lut[i].freq) { uitmp = bandwidth_lut[i].freq; break; } } dev->tx_bandwidth->val = uitmp; dev->tx_bandwidth->cur.val = uitmp; dev_dbg(&intf->dev, "TX bandwidth selected=%u\n", uitmp); set_bit(RX_BANDWIDTH, &dev->flags); } else { uitmp = 0; } if (uitmp) { uitmp1 = uitmp2 = 0; uitmp1 |= ((uitmp >> 0) & 0xff) << 0; uitmp1 |= ((uitmp >> 8) & 0xff) << 8; uitmp2 |= ((uitmp >> 16) & 0xff) << 0; uitmp2 |= ((uitmp >> 24) & 0xff) << 8; ret = hackrf_ctrl_msg(dev, CMD_BASEBAND_FILTER_BANDWIDTH_SET, uitmp1, uitmp2, NULL, 0); if (ret) goto err; } /* RX / TX RF frequency */ if (rx && test_and_clear_bit(RX_RF_FREQUENCY, &dev->flags)) { dev_dbg(&intf->dev, "RX RF frequency=%u Hz\n", dev->f_rx); uitmp1 = dev->f_rx / 1000000; uitmp2 = dev->f_rx % 1000000; set_bit(TX_RF_FREQUENCY, &dev->flags); } else if (tx && test_and_clear_bit(TX_RF_FREQUENCY, &dev->flags)) { dev_dbg(&intf->dev, "TX RF frequency=%u Hz\n", dev->f_tx); uitmp1 = dev->f_tx / 1000000; uitmp2 = dev->f_tx % 1000000; set_bit(RX_RF_FREQUENCY, &dev->flags); } else { uitmp1 = uitmp2 = 0; } if (uitmp1 || uitmp2) { buf[0] = (uitmp1 >> 0) & 0xff; buf[1] = (uitmp1 >> 8) & 0xff; buf[2] = (uitmp1 >> 16) & 0xff; buf[3] = (uitmp1 >> 24) & 0xff; buf[4] = (uitmp2 >> 0) & 0xff; buf[5] = (uitmp2 >> 8) & 0xff; buf[6] = (uitmp2 >> 16) & 0xff; buf[7] = (uitmp2 >> 24) & 0xff; ret = hackrf_ctrl_msg(dev, CMD_SET_FREQ, 0, 0, buf, 8); if (ret) goto err; } /* RX RF gain */ if (rx && test_and_clear_bit(RX_RF_GAIN, &dev->flags)) { dev_dbg(&intf->dev, "RX RF gain val=%d->%d\n", dev->rx_rf_gain->cur.val, dev->rx_rf_gain->val); u8tmp = (dev->rx_rf_gain->val) ? 1 : 0; ret = hackrf_ctrl_msg(dev, CMD_AMP_ENABLE, u8tmp, 0, NULL, 0); if (ret) goto err; set_bit(TX_RF_GAIN, &dev->flags); } /* TX RF gain */ if (tx && test_and_clear_bit(TX_RF_GAIN, &dev->flags)) { dev_dbg(&intf->dev, "TX RF gain val=%d->%d\n", dev->tx_rf_gain->cur.val, dev->tx_rf_gain->val); u8tmp = (dev->tx_rf_gain->val) ? 1 : 0; ret = hackrf_ctrl_msg(dev, CMD_AMP_ENABLE, u8tmp, 0, NULL, 0); if (ret) goto err; set_bit(RX_RF_GAIN, &dev->flags); } /* RX LNA gain */ if (rx && test_and_clear_bit(RX_LNA_GAIN, &dev->flags)) { dev_dbg(dev->dev, "RX LNA gain val=%d->%d\n", dev->rx_lna_gain->cur.val, dev->rx_lna_gain->val); ret = hackrf_ctrl_msg(dev, CMD_SET_LNA_GAIN, 0, dev->rx_lna_gain->val, &u8tmp, 1); if (ret) goto err; } /* RX IF gain */ if (rx && test_and_clear_bit(RX_IF_GAIN, &dev->flags)) { dev_dbg(&intf->dev, "IF gain val=%d->%d\n", dev->rx_if_gain->cur.val, dev->rx_if_gain->val); ret = hackrf_ctrl_msg(dev, CMD_SET_VGA_GAIN, 0, dev->rx_if_gain->val, &u8tmp, 1); if (ret) goto err; } /* TX LNA gain */ if (tx && test_and_clear_bit(TX_LNA_GAIN, &dev->flags)) { dev_dbg(&intf->dev, "TX LNA gain val=%d->%d\n", dev->tx_lna_gain->cur.val, dev->tx_lna_gain->val); ret = hackrf_ctrl_msg(dev, CMD_SET_TXVGA_GAIN, 0, dev->tx_lna_gain->val, &u8tmp, 1); if (ret) goto err; } return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } /* Private functions */ static struct hackrf_buffer *hackrf_get_next_buffer(struct hackrf_dev *dev, struct list_head *buffer_list) { unsigned long flags; struct hackrf_buffer *buffer = NULL; spin_lock_irqsave(&dev->buffer_list_lock, flags); if (list_empty(buffer_list)) goto leave; buffer = list_entry(buffer_list->next, struct hackrf_buffer, list); list_del(&buffer->list); leave: spin_unlock_irqrestore(&dev->buffer_list_lock, flags); return buffer; } static void hackrf_copy_stream(struct hackrf_dev *dev, void *dst, void *src, unsigned int src_len) { memcpy(dst, src, src_len); /* calculate sample rate and output it in 10 seconds intervals */ if (unlikely(time_is_before_jiffies(dev->jiffies_next))) { #define MSECS 10000UL unsigned int msecs = jiffies_to_msecs(jiffies - dev->jiffies_next + msecs_to_jiffies(MSECS)); unsigned int samples = dev->sample - dev->sample_measured; dev->jiffies_next = jiffies + msecs_to_jiffies(MSECS); dev->sample_measured = dev->sample; dev_dbg(dev->dev, "slen=%u samples=%u msecs=%u sample rate=%lu\n", src_len, samples, msecs, samples * 1000UL / msecs); } /* total number of samples */ dev->sample += src_len / 2; } /* * This gets called for the bulk stream pipe. This is done in interrupt * time, so it has to be fast, not crash, and not stall. Neat. */ static void hackrf_urb_complete_in(struct urb *urb) { struct hackrf_dev *dev = urb->context; struct usb_interface *intf = dev->intf; struct hackrf_buffer *buffer; unsigned int len; dev_dbg_ratelimited(&intf->dev, "status=%d length=%u/%u\n", urb->status, urb->actual_length, urb->transfer_buffer_length); switch (urb->status) { case 0: /* success */ case -ETIMEDOUT: /* NAK */ break; case -ECONNRESET: /* kill */ case -ENOENT: case -ESHUTDOWN: return; default: /* error */ dev_err_ratelimited(&intf->dev, "URB failed %d\n", urb->status); goto exit_usb_submit_urb; } /* get buffer to write */ buffer = hackrf_get_next_buffer(dev, &dev->rx_buffer_list); if (unlikely(buffer == NULL)) { dev->vb_full++; dev_notice_ratelimited(&intf->dev, "buffer is full - %u packets dropped\n", dev->vb_full); goto exit_usb_submit_urb; } len = min_t(unsigned long, vb2_plane_size(&buffer->vb.vb2_buf, 0), urb->actual_length); hackrf_copy_stream(dev, vb2_plane_vaddr(&buffer->vb.vb2_buf, 0), urb->transfer_buffer, len); vb2_set_plane_payload(&buffer->vb.vb2_buf, 0, len); buffer->vb.sequence = dev->sequence++; buffer->vb.vb2_buf.timestamp = ktime_get_ns(); vb2_buffer_done(&buffer->vb.vb2_buf, VB2_BUF_STATE_DONE); exit_usb_submit_urb: usb_submit_urb(urb, GFP_ATOMIC); } static void hackrf_urb_complete_out(struct urb *urb) { struct hackrf_dev *dev = urb->context; struct usb_interface *intf = dev->intf; struct hackrf_buffer *buffer; unsigned int len; dev_dbg_ratelimited(&intf->dev, "status=%d length=%u/%u\n", urb->status, urb->actual_length, urb->transfer_buffer_length); switch (urb->status) { case 0: /* success */ case -ETIMEDOUT: /* NAK */ break; case -ECONNRESET: /* kill */ case -ENOENT: case -ESHUTDOWN: return; default: /* error */ dev_err_ratelimited(&intf->dev, "URB failed %d\n", urb->status); } /* get buffer to read */ buffer = hackrf_get_next_buffer(dev, &dev->tx_buffer_list); if (unlikely(buffer == NULL)) { dev->vb_empty++; dev_notice_ratelimited(&intf->dev, "buffer is empty - %u packets dropped\n", dev->vb_empty); urb->actual_length = 0; goto exit_usb_submit_urb; } len = min_t(unsigned long, urb->transfer_buffer_length, vb2_get_plane_payload(&buffer->vb.vb2_buf, 0)); hackrf_copy_stream(dev, urb->transfer_buffer, vb2_plane_vaddr(&buffer->vb.vb2_buf, 0), len); urb->actual_length = len; buffer->vb.sequence = dev->sequence++; buffer->vb.vb2_buf.timestamp = ktime_get_ns(); vb2_buffer_done(&buffer->vb.vb2_buf, VB2_BUF_STATE_DONE); exit_usb_submit_urb: usb_submit_urb(urb, GFP_ATOMIC); } static int hackrf_kill_urbs(struct hackrf_dev *dev) { int i; for (i = dev->urbs_submitted - 1; i >= 0; i--) { dev_dbg(dev->dev, "kill urb=%d\n", i); /* stop the URB */ usb_kill_urb(dev->urb_list[i]); } dev->urbs_submitted = 0; return 0; } static int hackrf_submit_urbs(struct hackrf_dev *dev) { int i, ret; for (i = 0; i < dev->urbs_initialized; i++) { dev_dbg(dev->dev, "submit urb=%d\n", i); ret = usb_submit_urb(dev->urb_list[i], GFP_KERNEL); if (ret) { dev_err(dev->dev, "Could not submit URB no. %d - get them all back\n", i); hackrf_kill_urbs(dev); return ret; } dev->urbs_submitted++; } return 0; } static int hackrf_free_stream_bufs(struct hackrf_dev *dev) { if (dev->flags & USB_STATE_URB_BUF) { while (dev->buf_num) { dev->buf_num--; dev_dbg(dev->dev, "free buf=%d\n", dev->buf_num); usb_free_coherent(dev->udev, dev->buf_size, dev->buf_list[dev->buf_num], dev->dma_addr[dev->buf_num]); } } dev->flags &= ~USB_STATE_URB_BUF; return 0; } static int hackrf_alloc_stream_bufs(struct hackrf_dev *dev) { dev->buf_num = 0; dev->buf_size = BULK_BUFFER_SIZE; dev_dbg(dev->dev, "all in all I will use %u bytes for streaming\n", MAX_BULK_BUFS * BULK_BUFFER_SIZE); for (dev->buf_num = 0; dev->buf_num < MAX_BULK_BUFS; dev->buf_num++) { dev->buf_list[dev->buf_num] = usb_alloc_coherent(dev->udev, BULK_BUFFER_SIZE, GFP_KERNEL, &dev->dma_addr[dev->buf_num]); if (!dev->buf_list[dev->buf_num]) { dev_dbg(dev->dev, "alloc buf=%d failed\n", dev->buf_num); hackrf_free_stream_bufs(dev); return -ENOMEM; } dev_dbg(dev->dev, "alloc buf=%d %p (dma %llu)\n", dev->buf_num, dev->buf_list[dev->buf_num], (long long)dev->dma_addr[dev->buf_num]); dev->flags |= USB_STATE_URB_BUF; } return 0; } static int hackrf_free_urbs(struct hackrf_dev *dev) { int i; hackrf_kill_urbs(dev); for (i = dev->urbs_initialized - 1; i >= 0; i--) { if (dev->urb_list[i]) { dev_dbg(dev->dev, "free urb=%d\n", i); /* free the URBs */ usb_free_urb(dev->urb_list[i]); } } dev->urbs_initialized = 0; return 0; } static int hackrf_alloc_urbs(struct hackrf_dev *dev, bool rcv) { int i, j; unsigned int pipe; usb_complete_t complete; if (rcv) { pipe = usb_rcvbulkpipe(dev->udev, 0x81); complete = &hackrf_urb_complete_in; } else { pipe = usb_sndbulkpipe(dev->udev, 0x02); complete = &hackrf_urb_complete_out; } /* allocate the URBs */ for (i = 0; i < MAX_BULK_BUFS; i++) { dev_dbg(dev->dev, "alloc urb=%d\n", i); dev->urb_list[i] = usb_alloc_urb(0, GFP_KERNEL); if (!dev->urb_list[i]) { for (j = 0; j < i; j++) usb_free_urb(dev->urb_list[j]); return -ENOMEM; } usb_fill_bulk_urb(dev->urb_list[i], dev->udev, pipe, dev->buf_list[i], BULK_BUFFER_SIZE, complete, dev); dev->urb_list[i]->transfer_flags = URB_NO_TRANSFER_DMA_MAP; dev->urb_list[i]->transfer_dma = dev->dma_addr[i]; dev->urbs_initialized++; } return 0; } /* The user yanked out the cable... */ static void hackrf_disconnect(struct usb_interface *intf) { struct v4l2_device *v = usb_get_intfdata(intf); struct hackrf_dev *dev = container_of(v, struct hackrf_dev, v4l2_dev); dev_dbg(dev->dev, "\n"); mutex_lock(&dev->vb_queue_lock); mutex_lock(&dev->v4l2_lock); /* No need to keep the urbs around after disconnection */ dev->udev = NULL; v4l2_device_disconnect(&dev->v4l2_dev); video_unregister_device(&dev->tx_vdev); video_unregister_device(&dev->rx_vdev); mutex_unlock(&dev->v4l2_lock); mutex_unlock(&dev->vb_queue_lock); v4l2_device_put(&dev->v4l2_dev); } /* Videobuf2 operations */ static void hackrf_return_all_buffers(struct vb2_queue *vq, enum vb2_buffer_state state) { struct hackrf_dev *dev = vb2_get_drv_priv(vq); struct usb_interface *intf = dev->intf; struct hackrf_buffer *buffer, *node; struct list_head *buffer_list; unsigned long flags; dev_dbg(&intf->dev, "\n"); if (vq->type == V4L2_BUF_TYPE_SDR_CAPTURE) buffer_list = &dev->rx_buffer_list; else buffer_list = &dev->tx_buffer_list; spin_lock_irqsave(&dev->buffer_list_lock, flags); list_for_each_entry_safe(buffer, node, buffer_list, list) { dev_dbg(&intf->dev, "list_for_each_entry_safe\n"); vb2_buffer_done(&buffer->vb.vb2_buf, state); list_del(&buffer->list); } spin_unlock_irqrestore(&dev->buffer_list_lock, flags); } static int hackrf_queue_setup(struct vb2_queue *vq, unsigned int *nbuffers, unsigned int *nplanes, unsigned int sizes[], struct device *alloc_devs[]) { struct hackrf_dev *dev = vb2_get_drv_priv(vq); unsigned int q_num_bufs = vb2_get_num_buffers(vq); dev_dbg(dev->dev, "nbuffers=%d\n", *nbuffers); /* Need at least 8 buffers */ if (q_num_bufs + *nbuffers < 8) *nbuffers = 8 - q_num_bufs; *nplanes = 1; sizes[0] = PAGE_ALIGN(dev->buffersize); dev_dbg(dev->dev, "nbuffers=%d sizes[0]=%d\n", *nbuffers, sizes[0]); return 0; } static void hackrf_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vb2_queue *vq = vb->vb2_queue; struct hackrf_dev *dev = vb2_get_drv_priv(vq); struct hackrf_buffer *buffer = container_of(vbuf, struct hackrf_buffer, vb); struct list_head *buffer_list; unsigned long flags; dev_dbg_ratelimited(&dev->intf->dev, "\n"); if (vq->type == V4L2_BUF_TYPE_SDR_CAPTURE) buffer_list = &dev->rx_buffer_list; else buffer_list = &dev->tx_buffer_list; spin_lock_irqsave(&dev->buffer_list_lock, flags); list_add_tail(&buffer->list, buffer_list); spin_unlock_irqrestore(&dev->buffer_list_lock, flags); } static int hackrf_start_streaming(struct vb2_queue *vq, unsigned int count) { struct hackrf_dev *dev = vb2_get_drv_priv(vq); struct usb_interface *intf = dev->intf; int ret; unsigned int mode; dev_dbg(&intf->dev, "count=%i\n", count); mutex_lock(&dev->v4l2_lock); /* Allow only RX or TX, not both same time */ if (vq->type == V4L2_BUF_TYPE_SDR_CAPTURE) { if (test_bit(TX_ON, &dev->flags)) { ret = -EBUSY; goto err_hackrf_return_all_buffers; } mode = 1; set_bit(RX_ON, &dev->flags); } else { if (test_bit(RX_ON, &dev->flags)) { ret = -EBUSY; goto err_hackrf_return_all_buffers; } mode = 2; set_bit(TX_ON, &dev->flags); } dev->sequence = 0; ret = hackrf_alloc_stream_bufs(dev); if (ret) goto err; ret = hackrf_alloc_urbs(dev, (mode == 1)); if (ret) goto err; ret = hackrf_submit_urbs(dev); if (ret) goto err; ret = hackrf_set_params(dev); if (ret) goto err; /* start hardware streaming */ ret = hackrf_ctrl_msg(dev, CMD_SET_TRANSCEIVER_MODE, mode, 0, NULL, 0); if (ret) goto err; mutex_unlock(&dev->v4l2_lock); return 0; err: hackrf_kill_urbs(dev); hackrf_free_urbs(dev); hackrf_free_stream_bufs(dev); clear_bit(RX_ON, &dev->flags); clear_bit(TX_ON, &dev->flags); err_hackrf_return_all_buffers: hackrf_return_all_buffers(vq, VB2_BUF_STATE_QUEUED); mutex_unlock(&dev->v4l2_lock); dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static void hackrf_stop_streaming(struct vb2_queue *vq) { struct hackrf_dev *dev = vb2_get_drv_priv(vq); struct usb_interface *intf = dev->intf; dev_dbg(&intf->dev, "\n"); mutex_lock(&dev->v4l2_lock); /* stop hardware streaming */ hackrf_ctrl_msg(dev, CMD_SET_TRANSCEIVER_MODE, 0, 0, NULL, 0); hackrf_kill_urbs(dev); hackrf_free_urbs(dev); hackrf_free_stream_bufs(dev); hackrf_return_all_buffers(vq, VB2_BUF_STATE_ERROR); if (vq->type == V4L2_BUF_TYPE_SDR_CAPTURE) clear_bit(RX_ON, &dev->flags); else clear_bit(TX_ON, &dev->flags); mutex_unlock(&dev->v4l2_lock); } static const struct vb2_ops hackrf_vb2_ops = { .queue_setup = hackrf_queue_setup, .buf_queue = hackrf_buf_queue, .start_streaming = hackrf_start_streaming, .stop_streaming = hackrf_stop_streaming, .wait_prepare = vb2_ops_wait_prepare, .wait_finish = vb2_ops_wait_finish, }; static int hackrf_querycap(struct file *file, void *fh, struct v4l2_capability *cap) { struct hackrf_dev *dev = video_drvdata(file); struct usb_interface *intf = dev->intf; dev_dbg(&intf->dev, "\n"); cap->capabilities = V4L2_CAP_SDR_CAPTURE | V4L2_CAP_TUNER | V4L2_CAP_SDR_OUTPUT | V4L2_CAP_MODULATOR | V4L2_CAP_STREAMING | V4L2_CAP_READWRITE | V4L2_CAP_DEVICE_CAPS; strscpy(cap->driver, KBUILD_MODNAME, sizeof(cap->driver)); strscpy(cap->card, dev->rx_vdev.name, sizeof(cap->card)); usb_make_path(dev->udev, cap->bus_info, sizeof(cap->bus_info)); return 0; } static int hackrf_s_fmt_sdr(struct file *file, void *priv, struct v4l2_format *f) { struct hackrf_dev *dev = video_drvdata(file); struct video_device *vdev = video_devdata(file); struct vb2_queue *q; int i; dev_dbg(dev->dev, "pixelformat fourcc %4.4s\n", (char *)&f->fmt.sdr.pixelformat); if (vdev->vfl_dir == VFL_DIR_RX) q = &dev->rx_vb2_queue; else q = &dev->tx_vb2_queue; if (vb2_is_busy(q)) return -EBUSY; for (i = 0; i < NUM_FORMATS; i++) { if (f->fmt.sdr.pixelformat == formats[i].pixelformat) { dev->pixelformat = formats[i].pixelformat; dev->buffersize = formats[i].buffersize; f->fmt.sdr.buffersize = formats[i].buffersize; return 0; } } dev->pixelformat = formats[0].pixelformat; dev->buffersize = formats[0].buffersize; f->fmt.sdr.pixelformat = formats[0].pixelformat; f->fmt.sdr.buffersize = formats[0].buffersize; return 0; } static int hackrf_g_fmt_sdr(struct file *file, void *priv, struct v4l2_format *f) { struct hackrf_dev *dev = video_drvdata(file); dev_dbg(dev->dev, "pixelformat fourcc %4.4s\n", (char *)&dev->pixelformat); f->fmt.sdr.pixelformat = dev->pixelformat; f->fmt.sdr.buffersize = dev->buffersize; return 0; } static int hackrf_try_fmt_sdr(struct file *file, void *priv, struct v4l2_format *f) { struct hackrf_dev *dev = video_drvdata(file); int i; dev_dbg(dev->dev, "pixelformat fourcc %4.4s\n", (char *)&f->fmt.sdr.pixelformat); for (i = 0; i < NUM_FORMATS; i++) { if (formats[i].pixelformat == f->fmt.sdr.pixelformat) { f->fmt.sdr.buffersize = formats[i].buffersize; return 0; } } f->fmt.sdr.pixelformat = formats[0].pixelformat; f->fmt.sdr.buffersize = formats[0].buffersize; return 0; } static int hackrf_enum_fmt_sdr(struct file *file, void *priv, struct v4l2_fmtdesc *f) { struct hackrf_dev *dev = video_drvdata(file); dev_dbg(dev->dev, "index=%d\n", f->index); if (f->index >= NUM_FORMATS) return -EINVAL; f->pixelformat = formats[f->index].pixelformat; return 0; } static int hackrf_s_tuner(struct file *file, void *priv, const struct v4l2_tuner *v) { struct hackrf_dev *dev = video_drvdata(file); int ret; dev_dbg(dev->dev, "index=%d\n", v->index); if (v->index == 0) ret = 0; else if (v->index == 1) ret = 0; else ret = -EINVAL; return ret; } static int hackrf_g_tuner(struct file *file, void *priv, struct v4l2_tuner *v) { struct hackrf_dev *dev = video_drvdata(file); int ret; dev_dbg(dev->dev, "index=%d\n", v->index); if (v->index == 0) { strscpy(v->name, "HackRF ADC", sizeof(v->name)); v->type = V4L2_TUNER_SDR; v->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; v->rangelow = bands_adc_dac[0].rangelow; v->rangehigh = bands_adc_dac[0].rangehigh; ret = 0; } else if (v->index == 1) { strscpy(v->name, "HackRF RF", sizeof(v->name)); v->type = V4L2_TUNER_RF; v->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; v->rangelow = bands_rx_tx[0].rangelow; v->rangehigh = bands_rx_tx[0].rangehigh; ret = 0; } else { ret = -EINVAL; } return ret; } static int hackrf_s_modulator(struct file *file, void *fh, const struct v4l2_modulator *a) { struct hackrf_dev *dev = video_drvdata(file); dev_dbg(dev->dev, "index=%d\n", a->index); return a->index > 1 ? -EINVAL : 0; } static int hackrf_g_modulator(struct file *file, void *fh, struct v4l2_modulator *a) { struct hackrf_dev *dev = video_drvdata(file); int ret; dev_dbg(dev->dev, "index=%d\n", a->index); if (a->index == 0) { strscpy(a->name, "HackRF DAC", sizeof(a->name)); a->type = V4L2_TUNER_SDR; a->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; a->rangelow = bands_adc_dac[0].rangelow; a->rangehigh = bands_adc_dac[0].rangehigh; ret = 0; } else if (a->index == 1) { strscpy(a->name, "HackRF RF", sizeof(a->name)); a->type = V4L2_TUNER_RF; a->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; a->rangelow = bands_rx_tx[0].rangelow; a->rangehigh = bands_rx_tx[0].rangehigh; ret = 0; } else { ret = -EINVAL; } return ret; } static int hackrf_s_frequency(struct file *file, void *priv, const struct v4l2_frequency *f) { struct hackrf_dev *dev = video_drvdata(file); struct usb_interface *intf = dev->intf; struct video_device *vdev = video_devdata(file); int ret; unsigned int uitmp; dev_dbg(&intf->dev, "tuner=%d type=%d frequency=%u\n", f->tuner, f->type, f->frequency); if (f->tuner == 0) { uitmp = clamp(f->frequency, bands_adc_dac[0].rangelow, bands_adc_dac[0].rangehigh); if (vdev->vfl_dir == VFL_DIR_RX) { dev->f_adc = uitmp; set_bit(RX_ADC_FREQUENCY, &dev->flags); } else { dev->f_dac = uitmp; set_bit(TX_DAC_FREQUENCY, &dev->flags); } } else if (f->tuner == 1) { uitmp = clamp(f->frequency, bands_rx_tx[0].rangelow, bands_rx_tx[0].rangehigh); if (vdev->vfl_dir == VFL_DIR_RX) { dev->f_rx = uitmp; set_bit(RX_RF_FREQUENCY, &dev->flags); } else { dev->f_tx = uitmp; set_bit(TX_RF_FREQUENCY, &dev->flags); } } else { ret = -EINVAL; goto err; } ret = hackrf_set_params(dev); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static int hackrf_g_frequency(struct file *file, void *priv, struct v4l2_frequency *f) { struct hackrf_dev *dev = video_drvdata(file); struct usb_interface *intf = dev->intf; struct video_device *vdev = video_devdata(file); int ret; dev_dbg(dev->dev, "tuner=%d type=%d\n", f->tuner, f->type); if (f->tuner == 0) { f->type = V4L2_TUNER_SDR; if (vdev->vfl_dir == VFL_DIR_RX) f->frequency = dev->f_adc; else f->frequency = dev->f_dac; } else if (f->tuner == 1) { f->type = V4L2_TUNER_RF; if (vdev->vfl_dir == VFL_DIR_RX) f->frequency = dev->f_rx; else f->frequency = dev->f_tx; } else { ret = -EINVAL; goto err; } return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static int hackrf_enum_freq_bands(struct file *file, void *priv, struct v4l2_frequency_band *band) { struct hackrf_dev *dev = video_drvdata(file); int ret; dev_dbg(dev->dev, "tuner=%d type=%d index=%d\n", band->tuner, band->type, band->index); if (band->tuner == 0) { if (band->index >= ARRAY_SIZE(bands_adc_dac)) { ret = -EINVAL; } else { *band = bands_adc_dac[band->index]; ret = 0; } } else if (band->tuner == 1) { if (band->index >= ARRAY_SIZE(bands_rx_tx)) { ret = -EINVAL; } else { *band = bands_rx_tx[band->index]; ret = 0; } } else { ret = -EINVAL; } return ret; } static const struct v4l2_ioctl_ops hackrf_ioctl_ops = { .vidioc_querycap = hackrf_querycap, .vidioc_s_fmt_sdr_cap = hackrf_s_fmt_sdr, .vidioc_g_fmt_sdr_cap = hackrf_g_fmt_sdr, .vidioc_enum_fmt_sdr_cap = hackrf_enum_fmt_sdr, .vidioc_try_fmt_sdr_cap = hackrf_try_fmt_sdr, .vidioc_s_fmt_sdr_out = hackrf_s_fmt_sdr, .vidioc_g_fmt_sdr_out = hackrf_g_fmt_sdr, .vidioc_enum_fmt_sdr_out = hackrf_enum_fmt_sdr, .vidioc_try_fmt_sdr_out = hackrf_try_fmt_sdr, .vidioc_reqbufs = vb2_ioctl_reqbufs, .vidioc_create_bufs = vb2_ioctl_create_bufs, .vidioc_prepare_buf = vb2_ioctl_prepare_buf, .vidioc_querybuf = vb2_ioctl_querybuf, .vidioc_qbuf = vb2_ioctl_qbuf, .vidioc_dqbuf = vb2_ioctl_dqbuf, .vidioc_expbuf = vb2_ioctl_expbuf, .vidioc_streamon = vb2_ioctl_streamon, .vidioc_streamoff = vb2_ioctl_streamoff, .vidioc_s_tuner = hackrf_s_tuner, .vidioc_g_tuner = hackrf_g_tuner, .vidioc_s_modulator = hackrf_s_modulator, .vidioc_g_modulator = hackrf_g_modulator, .vidioc_s_frequency = hackrf_s_frequency, .vidioc_g_frequency = hackrf_g_frequency, .vidioc_enum_freq_bands = hackrf_enum_freq_bands, .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, .vidioc_log_status = v4l2_ctrl_log_status, }; static const struct v4l2_file_operations hackrf_fops = { .owner = THIS_MODULE, .open = v4l2_fh_open, .release = vb2_fop_release, .read = vb2_fop_read, .write = vb2_fop_write, .poll = vb2_fop_poll, .mmap = vb2_fop_mmap, .unlocked_ioctl = video_ioctl2, }; static const struct video_device hackrf_template = { .name = "HackRF One", .release = video_device_release_empty, .fops = &hackrf_fops, .ioctl_ops = &hackrf_ioctl_ops, }; static void hackrf_video_release(struct v4l2_device *v) { struct hackrf_dev *dev = container_of(v, struct hackrf_dev, v4l2_dev); dev_dbg(dev->dev, "\n"); v4l2_ctrl_handler_free(&dev->rx_ctrl_handler); v4l2_ctrl_handler_free(&dev->tx_ctrl_handler); v4l2_device_unregister(&dev->v4l2_dev); kfree(dev); } static int hackrf_s_ctrl_rx(struct v4l2_ctrl *ctrl) { struct hackrf_dev *dev = container_of(ctrl->handler, struct hackrf_dev, rx_ctrl_handler); struct usb_interface *intf = dev->intf; int ret; switch (ctrl->id) { case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: case V4L2_CID_RF_TUNER_BANDWIDTH: set_bit(RX_BANDWIDTH, &dev->flags); break; case V4L2_CID_RF_TUNER_RF_GAIN: set_bit(RX_RF_GAIN, &dev->flags); break; case V4L2_CID_RF_TUNER_LNA_GAIN: set_bit(RX_LNA_GAIN, &dev->flags); break; case V4L2_CID_RF_TUNER_IF_GAIN: set_bit(RX_IF_GAIN, &dev->flags); break; default: dev_dbg(&intf->dev, "unknown ctrl: id=%d name=%s\n", ctrl->id, ctrl->name); ret = -EINVAL; goto err; } ret = hackrf_set_params(dev); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static int hackrf_s_ctrl_tx(struct v4l2_ctrl *ctrl) { struct hackrf_dev *dev = container_of(ctrl->handler, struct hackrf_dev, tx_ctrl_handler); struct usb_interface *intf = dev->intf; int ret; switch (ctrl->id) { case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: case V4L2_CID_RF_TUNER_BANDWIDTH: set_bit(TX_BANDWIDTH, &dev->flags); break; case V4L2_CID_RF_TUNER_LNA_GAIN: set_bit(TX_LNA_GAIN, &dev->flags); break; case V4L2_CID_RF_TUNER_RF_GAIN: set_bit(TX_RF_GAIN, &dev->flags); break; default: dev_dbg(&intf->dev, "unknown ctrl: id=%d name=%s\n", ctrl->id, ctrl->name); ret = -EINVAL; goto err; } ret = hackrf_set_params(dev); if (ret) goto err; return 0; err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } static const struct v4l2_ctrl_ops hackrf_ctrl_ops_rx = { .s_ctrl = hackrf_s_ctrl_rx, }; static const struct v4l2_ctrl_ops hackrf_ctrl_ops_tx = { .s_ctrl = hackrf_s_ctrl_tx, }; static int hackrf_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct hackrf_dev *dev; int ret; u8 u8tmp, buf[BUF_SIZE]; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { ret = -ENOMEM; goto err; } mutex_init(&dev->v4l2_lock); mutex_init(&dev->vb_queue_lock); spin_lock_init(&dev->buffer_list_lock); INIT_LIST_HEAD(&dev->rx_buffer_list); INIT_LIST_HEAD(&dev->tx_buffer_list); dev->intf = intf; dev->dev = &intf->dev; dev->udev = interface_to_usbdev(intf); dev->pixelformat = formats[0].pixelformat; dev->buffersize = formats[0].buffersize; dev->f_adc = bands_adc_dac[0].rangelow; dev->f_dac = bands_adc_dac[0].rangelow; dev->f_rx = bands_rx_tx[0].rangelow; dev->f_tx = bands_rx_tx[0].rangelow; set_bit(RX_ADC_FREQUENCY, &dev->flags); set_bit(TX_DAC_FREQUENCY, &dev->flags); set_bit(RX_RF_FREQUENCY, &dev->flags); set_bit(TX_RF_FREQUENCY, &dev->flags); /* Detect device */ ret = hackrf_ctrl_msg(dev, CMD_BOARD_ID_READ, 0, 0, &u8tmp, 1); if (ret == 0) ret = hackrf_ctrl_msg(dev, CMD_VERSION_STRING_READ, 0, 0, buf, BUF_SIZE); if (ret) { dev_err(dev->dev, "Could not detect board\n"); goto err_kfree; } buf[BUF_SIZE - 1] = '\0'; dev_info(dev->dev, "Board ID: %02x\n", u8tmp); dev_info(dev->dev, "Firmware version: %s\n", buf); /* Init vb2 queue structure for receiver */ dev->rx_vb2_queue.type = V4L2_BUF_TYPE_SDR_CAPTURE; dev->rx_vb2_queue.io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF | VB2_READ; dev->rx_vb2_queue.ops = &hackrf_vb2_ops; dev->rx_vb2_queue.mem_ops = &vb2_vmalloc_memops; dev->rx_vb2_queue.drv_priv = dev; dev->rx_vb2_queue.buf_struct_size = sizeof(struct hackrf_buffer); dev->rx_vb2_queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ret = vb2_queue_init(&dev->rx_vb2_queue); if (ret) { dev_err(dev->dev, "Could not initialize rx vb2 queue\n"); goto err_kfree; } /* Init vb2 queue structure for transmitter */ dev->tx_vb2_queue.type = V4L2_BUF_TYPE_SDR_OUTPUT; dev->tx_vb2_queue.io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF | VB2_WRITE; dev->tx_vb2_queue.ops = &hackrf_vb2_ops; dev->tx_vb2_queue.mem_ops = &vb2_vmalloc_memops; dev->tx_vb2_queue.drv_priv = dev; dev->tx_vb2_queue.buf_struct_size = sizeof(struct hackrf_buffer); dev->tx_vb2_queue.timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ret = vb2_queue_init(&dev->tx_vb2_queue); if (ret) { dev_err(dev->dev, "Could not initialize tx vb2 queue\n"); goto err_kfree; } /* Register controls for receiver */ v4l2_ctrl_handler_init(&dev->rx_ctrl_handler, 5); dev->rx_bandwidth_auto = v4l2_ctrl_new_std(&dev->rx_ctrl_handler, &hackrf_ctrl_ops_rx, V4L2_CID_RF_TUNER_BANDWIDTH_AUTO, 0, 1, 0, 1); dev->rx_bandwidth = v4l2_ctrl_new_std(&dev->rx_ctrl_handler, &hackrf_ctrl_ops_rx, V4L2_CID_RF_TUNER_BANDWIDTH, 1750000, 28000000, 50000, 1750000); v4l2_ctrl_auto_cluster(2, &dev->rx_bandwidth_auto, 0, false); dev->rx_rf_gain = v4l2_ctrl_new_std(&dev->rx_ctrl_handler, &hackrf_ctrl_ops_rx, V4L2_CID_RF_TUNER_RF_GAIN, 0, 12, 12, 0); dev->rx_lna_gain = v4l2_ctrl_new_std(&dev->rx_ctrl_handler, &hackrf_ctrl_ops_rx, V4L2_CID_RF_TUNER_LNA_GAIN, 0, 40, 8, 0); dev->rx_if_gain = v4l2_ctrl_new_std(&dev->rx_ctrl_handler, &hackrf_ctrl_ops_rx, V4L2_CID_RF_TUNER_IF_GAIN, 0, 62, 2, 0); if (dev->rx_ctrl_handler.error) { ret = dev->rx_ctrl_handler.error; dev_err(dev->dev, "Could not initialize controls\n"); goto err_v4l2_ctrl_handler_free_rx; } v4l2_ctrl_grab(dev->rx_rf_gain, !hackrf_enable_rf_gain_ctrl); v4l2_ctrl_handler_setup(&dev->rx_ctrl_handler); /* Register controls for transmitter */ v4l2_ctrl_handler_init(&dev->tx_ctrl_handler, 4); dev->tx_bandwidth_auto = v4l2_ctrl_new_std(&dev->tx_ctrl_handler, &hackrf_ctrl_ops_tx, V4L2_CID_RF_TUNER_BANDWIDTH_AUTO, 0, 1, 0, 1); dev->tx_bandwidth = v4l2_ctrl_new_std(&dev->tx_ctrl_handler, &hackrf_ctrl_ops_tx, V4L2_CID_RF_TUNER_BANDWIDTH, 1750000, 28000000, 50000, 1750000); v4l2_ctrl_auto_cluster(2, &dev->tx_bandwidth_auto, 0, false); dev->tx_lna_gain = v4l2_ctrl_new_std(&dev->tx_ctrl_handler, &hackrf_ctrl_ops_tx, V4L2_CID_RF_TUNER_LNA_GAIN, 0, 47, 1, 0); dev->tx_rf_gain = v4l2_ctrl_new_std(&dev->tx_ctrl_handler, &hackrf_ctrl_ops_tx, V4L2_CID_RF_TUNER_RF_GAIN, 0, 15, 15, 0); if (dev->tx_ctrl_handler.error) { ret = dev->tx_ctrl_handler.error; dev_err(dev->dev, "Could not initialize controls\n"); goto err_v4l2_ctrl_handler_free_tx; } v4l2_ctrl_grab(dev->tx_rf_gain, !hackrf_enable_rf_gain_ctrl); v4l2_ctrl_handler_setup(&dev->tx_ctrl_handler); /* Register the v4l2_device structure */ dev->v4l2_dev.release = hackrf_video_release; ret = v4l2_device_register(&intf->dev, &dev->v4l2_dev); if (ret) { dev_err(dev->dev, "Failed to register v4l2-device (%d)\n", ret); goto err_v4l2_ctrl_handler_free_tx; } /* Init video_device structure for receiver */ dev->rx_vdev = hackrf_template; dev->rx_vdev.queue = &dev->rx_vb2_queue; dev->rx_vdev.queue->lock = &dev->vb_queue_lock; dev->rx_vdev.v4l2_dev = &dev->v4l2_dev; dev->rx_vdev.ctrl_handler = &dev->rx_ctrl_handler; dev->rx_vdev.lock = &dev->v4l2_lock; dev->rx_vdev.vfl_dir = VFL_DIR_RX; dev->rx_vdev.device_caps = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE | V4L2_CAP_SDR_CAPTURE | V4L2_CAP_TUNER; video_set_drvdata(&dev->rx_vdev, dev); ret = video_register_device(&dev->rx_vdev, VFL_TYPE_SDR, -1); if (ret) { dev_err(dev->dev, "Failed to register as video device (%d)\n", ret); goto err_v4l2_device_unregister; } dev_info(dev->dev, "Registered as %s\n", video_device_node_name(&dev->rx_vdev)); /* Init video_device structure for transmitter */ dev->tx_vdev = hackrf_template; dev->tx_vdev.queue = &dev->tx_vb2_queue; dev->tx_vdev.queue->lock = &dev->vb_queue_lock; dev->tx_vdev.v4l2_dev = &dev->v4l2_dev; dev->tx_vdev.ctrl_handler = &dev->tx_ctrl_handler; dev->tx_vdev.lock = &dev->v4l2_lock; dev->tx_vdev.vfl_dir = VFL_DIR_TX; dev->tx_vdev.device_caps = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE | V4L2_CAP_SDR_OUTPUT | V4L2_CAP_MODULATOR; video_set_drvdata(&dev->tx_vdev, dev); ret = video_register_device(&dev->tx_vdev, VFL_TYPE_SDR, -1); if (ret) { dev_err(dev->dev, "Failed to register as video device (%d)\n", ret); goto err_video_unregister_device_rx; } dev_info(dev->dev, "Registered as %s\n", video_device_node_name(&dev->tx_vdev)); dev_notice(dev->dev, "SDR API is still slightly experimental and functionality changes may follow\n"); return 0; err_video_unregister_device_rx: video_unregister_device(&dev->rx_vdev); err_v4l2_device_unregister: v4l2_device_unregister(&dev->v4l2_dev); err_v4l2_ctrl_handler_free_tx: v4l2_ctrl_handler_free(&dev->tx_ctrl_handler); err_v4l2_ctrl_handler_free_rx: v4l2_ctrl_handler_free(&dev->rx_ctrl_handler); err_kfree: kfree(dev); err: dev_dbg(&intf->dev, "failed=%d\n", ret); return ret; } /* USB device ID list */ static const struct usb_device_id hackrf_id_table[] = { { USB_DEVICE(0x1d50, 0x6089) }, /* HackRF One */ { } }; MODULE_DEVICE_TABLE(usb, hackrf_id_table); /* USB subsystem interface */ static struct usb_driver hackrf_driver = { .name = KBUILD_MODNAME, .probe = hackrf_probe, .disconnect = hackrf_disconnect, .id_table = hackrf_id_table, }; module_usb_driver(hackrf_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("HackRF"); MODULE_LICENSE("GPL");
88 87 88 88 88 87 88 88 88 88 88 88 88 88 231 60 62 55 10 62 187 188 149 187 80 144 138 139 137 138 43 43 1 1 1 1 1 89 89 89 88 89 89 89 89 88 89 88 89 89 89 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 3 3 3 3 3 3 3 88 3 3 3 89 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 88 88 91 88 88 88 88 88 88 87 91 90 90 89 87 89 89 3 89 88 87 88 87 88 3 87 86 88 3 6 6 86 87 87 2 3 3 89 4 88 87 88 88 86 88 87 88 87 88 4 4 5 5 2 2 34 34 34 34 11 91 3 3 91 84 84 84 84 84 92 84 84 84 84 84 92 100 100 12 89 89 89 90 90 90 89 90 90 90 90 89 90 90 90 89 90 90 90 1 90 90 1 90 90 90 88 89 90 90 89 90 90 90 90 90 90 89 90 90 90 90 90 90 90 90 90 90 90 90 90 90 1 1 90 90 89 90 90 90 90 90 90 90 90 90 90 90 89 90 90 90 89 89 90 90 2 90 2 90 90 90 90 90 90 90 100 99 98 98 4 101 90 11 12 12 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. * * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert()) returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different type of interfaces * we support. For a regular station this mean we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. */ struct sta_link_alloc { struct link_sta_info info; struct ieee80211_link_sta sta; struct rcu_head rcu_head; }; static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static const struct rhashtable_params link_sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct link_sta_info, link_hash_node), .key_offset = offsetof(struct link_sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_remove(&local->sta_hash, &sta->hash_node, sta_rht_params); } static int link_sta_info_hash_add(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_insert(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } static int link_sta_info_hash_del(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_remove(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } void ieee80211_purge_sta_txqs(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; int i; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi; if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); ieee80211_txq_purge(local, txqi); } } static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); } ieee80211_purge_sta_txqs(sta); for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. It would be nice to wait for the * driver to finish aggregation stop and then clean up, but for now * drivers have to handle aggregation stop being requested, followed * directly by station destruction. */ for (i = 0; i < IEEE80211_NUM_TIDS; i++) { kfree(sta->ampdu_mlme.tid_start_tx[i]); tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } } static void cleanup_single_sta(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; __cleanup_single_sta(sta); sta_info_free(local, sta); } struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); } /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } /* * Get sta info either from the specified interface * or from one of its vlans */ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->link_sta_hash, addr, link_sta_rht_params); } struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct link_sta_info *link_sta; rcu_read_lock(); for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return link_sta; } } rcu_read_unlock(); return NULL; } struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id) { struct ieee80211_local *local = hw_to_local(hw); struct link_sta_info *link_sta; struct rhlist_head *tmp; for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; struct ieee80211_link_data *link; u8 _link_id = link_sta->link_id; if (!localaddr) { if (link_id) *link_id = _link_id; return &sta->sta; } link = rcu_dereference(sta->sdata->link[_link_id]); if (!link) continue; if (memcmp(link->conf->addr, localaddr, ETH_ALEN)) continue; if (link_id) *link_id = _link_id; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs); struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } static void sta_info_free_link(struct link_sta_info *link_sta) { free_percpu(link_sta->pcpu_rx_stats); } static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { struct sta_link_alloc *alloc = NULL; struct link_sta_info *link_sta; lockdep_assert_wiphy(sta->local->hw.wiphy); link_sta = rcu_access_pointer(sta->link[link_id]); if (WARN_ON(!link_sta)) return; if (unhash) link_sta_info_hash_del(sta->local, link_sta); if (test_sta_flag(sta, WLAN_STA_INSERTED)) ieee80211_link_sta_debugfs_remove(link_sta); if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { sta_info_free_link(&alloc->info); kfree_rcu(alloc, rcu_head); } ieee80211_sta_recalc_aggregates(&sta->sta); } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_access_pointer(sta->link[i]); if (!link_sta) continue; sta_remove_link(sta, i, false); } /* * If we had used sta_info_pre_move_state() then we might not * have gone through the state transitions down again, so do * it here now (and warn if it's inserted). * * This will clear state such as fast TX/RX that may have been * allocated during state transitions. */ while (sta->sta_state > IEEE80211_STA_NONE) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, sta->sta_state - 1); if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) break; } if (sta->rate_ctrl) rate_control_free_sta(sta); sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif sta_info_free_link(&sta->deflink); kfree(sta); } static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_insert(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; local_bh_disable(); if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; return 0; } static int sta_info_alloc_link(struct ieee80211_local *local, struct link_sta_info *link_info, gfp_t gfp) { struct ieee80211_hw *hw = &local->hw; int i; if (ieee80211_hw_check(hw, USES_RSS)) { link_info->pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!link_info->pcpu_rx_stats) return -ENOMEM; } link_info->rx_stats.last_rx = jiffies; u64_stats_init(&link_info->rx_stats.syncp); ewma_signal_init(&link_info->rx_stats_avg.signal); ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++) ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]); return 0; } static void sta_info_add_link(struct sta_info *sta, unsigned int link_id, struct link_sta_info *link_info, struct ieee80211_link_sta *link_sta) { link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); link_sta->smps_mode = IEEE80211_SMPS_OFF; link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; } static struct sta_info * __sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, int link_id, const u8 *link_addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; void *txq_data; int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; sta->local = local; sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, &sta->sta.deflink); sta->sta.valid_links = BIT(link_id); } else { sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink); } sta->sta.cur = &sta->sta.deflink.agg; spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (!sdata->u.mesh.user_mpm) timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); memcpy(sta->deflink.addr, link_addr, ETH_ALEN); memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN); sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; /* TODO link specific alloc and assignments for MLO Link STA */ /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. To not use the initial Rx-only key * prematurely for Tx initialize ptk_idx to an impossible PTK keyid * which always will refer to a NULL key. */ BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); sta->ptk_idx = INVALID_PTK_KEYIDX; ieee80211_init_frag_cache(&sta->frags); sta->sta_state = IEEE80211_STA_NONE; if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) sta->amsdu_mesh_control = -1; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); size = sizeof(struct txq_info) + ALIGN(hw->txq_data_size, sizeof(void *)); txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); if (!txq_data) goto free; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; /* might not do anything for the (bufferable) MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; atomic_set(&sta->airtime[i].aql_tx_pending, 0); sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); for (i = 0; i < NUM_NL80211_BANDS; i++) { u32 mandatory = 0; int r; if (!hw->wiphy->bands[i]) continue; switch (i) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use * for this is when we don't know anything yet and send * management frames, and then we'll pick the lowest * possible rate anyway. * If we don't include _G here, we cannot find a rate * in P2P, and thus trigger the WARN_ONCE() in rate.c */ mandatory = IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: WARN_ON(1); mandatory = 0; break; } for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { struct ieee80211_rate *rate; rate = &hw->wiphy->bands[i]->bitrates[r]; if (!(rate->flags & mandatory)) continue; sta->sta.deflink.supp_rates[i] |= BIT(r); } } sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; sta->cparams.ce_threshold_selector = 0; sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; free_txq: kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif kfree(sta); return NULL; } struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { return __sta_info_alloc(sdata, addr, -1, addr, gfp); } struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp) { return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp); } static int sta_info_insert_check(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Can't be a WARN_ON because it can be triggered through a race: * something inserts a STA (on one CPU) without holding the RTNL * and another CPU turns off the net device. */ if (unlikely(!ieee80211_sdata_running(sdata))) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to * asynchronous resize/rehash. We also require the mutex * for correctness. */ rcu_read_lock(); if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { rcu_read_unlock(); return -ENOTUNIQ; } rcu_read_unlock(); return 0; } static int sta_info_insert_drv_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { enum ieee80211_sta_state state; int err = 0; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { err = drv_sta_state(local, sdata, sta, state, state + 1); if (err) break; } if (!err) { /* * Drivers using legacy sta_add/sta_remove callbacks only * get uploaded set to true after sta_add is called. */ if (!local->ops->sta_add) sta->uploaded = true; return 0; } if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { sdata_info(sdata, "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n", sta->sta.addr, state + 1, err); err = 0; } /* unwind on error */ for (; state > IEEE80211_STA_NOTEXIST; state--) WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1)); return err; } static void ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; bool allow_p2p_go_ps = sdata->vif.p2p; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; if (!sta->sta.support_p2p_ps) { allow_p2p_go_ps = false; break; } } rcu_read_unlock(); if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_P2P_PS); } } static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo = NULL; int err = 0; lockdep_assert_wiphy(local->hw.wiphy); /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_cleanup; } sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); if (!sinfo) { err = -ENOMEM; goto out_cleanup; } local->num_sta++; local->sta_generation++; smp_mb(); /* simplify things and don't accept BA sessions yet */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); /* make the station visible */ err = sta_info_hash_add(local, sta); if (err) goto out_drop_sta; if (sta->sta.valid_links) { err = link_sta_info_hash_add(local, &sta->deflink); if (err) { sta_info_hash_del(local, sta); goto out_drop_sta; } } list_add_tail_rcu(&sta->list, &local->sta_list); /* update channel context before notifying the driver about state * change, this enables driver using the updated channel context right away. */ if (sta->sta_state >= IEEE80211_STA_ASSOC) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } /* notify driver */ err = sta_info_insert_drv_state(local, sdata, sta); if (err) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); if (sta->sta.valid_links) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) continue; ieee80211_link_sta_debugfs_add(link_sta); if (sdata->vif.active_links & BIT(i)) ieee80211_link_sta_debugfs_drv_add(link_sta); } } else { ieee80211_link_sta_debugfs_add(&sta->deflink); ieee80211_link_sta_debugfs_drv_add(&sta->deflink); } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); /* move reference to rcu-protected */ rcu_read_lock(); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); ieee80211_check_fast_xmit(sta); return 0; out_remove: if (sta->sta.valid_links) link_sta_info_hash_del(local, &sta->deflink); sta_info_hash_del(local, sta); list_del_rcu(&sta->list); out_drop_sta: local->num_sta--; synchronize_net(); out_cleanup: cleanup_single_sta(sta); kfree(sinfo); rcu_read_lock(); return err; } int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; int err; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); err = sta_info_insert_check(sta); if (err) { sta_info_free(local, sta); rcu_read_lock(); return err; } return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) { int err = sta_info_insert_rcu(sta); rcu_read_unlock(); return err; } static inline void __bss_tim_set(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __set_bit() format. */ tim[id / 8] |= (1 << (id % 8)); } static inline void __bss_tim_clear(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __clear_bit() format. */ tim[id / 8] &= ~(1 << (id % 8)); } static inline bool __bss_tim_get(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the test_bit() format. */ return tim[id / 8] & (1 << (id % 8)); } static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ switch (ac) { case IEEE80211_AC_VO: return BIT(6) | BIT(7); case IEEE80211_AC_VI: return BIT(4) | BIT(5); case IEEE80211_AC_BE: return BIT(0) | BIT(3); case IEEE80211_AC_BK: return BIT(1) | BIT(2); default: WARN_ON(1); return 0; } } static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) { struct ieee80211_local *local = sta->local; struct ps_data *ps; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (WARN_ON_ONCE(!sta->sdata->bss)) return; ps = &sta->sdata->bss->ps; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; #endif } else { return; } /* No need to do anything if the driver does all */ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) goto done; /* * If all ACs are delivery-enabled then we should build * the TIM bit for all ACs anyway; if only some are then * we ignore those and build the TIM bit using only the * non-enabled ones. */ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_tim = 0; if (ignore_pending) ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac]) continue; indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac]); if (indicate_tim) break; tids = ieee80211_tids_for_ac(ac); indicate_tim |= sta->driver_buffered_tids & tids; indicate_tim |= sta->txq_buffered_tids & tids; } done: spin_lock_bh(&local->tim_lock); if (indicate_tim == __bss_tim_get(ps->tim, id)) goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); else __bss_tim_clear(ps->tim, id); if (local->ops->set_tim && !WARN_ON(sta->dead)) { local->tim_in_locked_section = true; drv_set_tim(local, &sta->sta, indicate_tim); local->tim_in_locked_section = false; } out_unlock: spin_unlock_bh(&local->tim_lock); } void sta_info_recalc_tim(struct sta_info *sta) { __sta_info_recalc_tim(sta, false); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info; int timeout; if (!skb) return false; info = IEEE80211_SKB_CB(skb); /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ timeout = (sta->listen_interval * sta->sdata->vif.bss_conf.beacon_int * 32 / 15625) * HZ; if (timeout < STA_TX_BUFFER_EXPIRE) timeout = STA_TX_BUFFER_EXPIRE; return time_after(jiffies, info->control.jiffies + timeout); } static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local, struct sta_info *sta, int ac) { unsigned long flags; struct sk_buff *skb; /* * First check for frames that should expire on the filtered * queue. Frames here were rejected by the driver and are on * a separate queue to avoid reordering with normal PS-buffered * frames. They also aren't accounted for right now in the * total_ps_buffered counter. */ for (;;) { spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb = skb_peek(&sta->tx_filtered[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->tx_filtered[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); /* * Frames are queued in order, so if this one * hasn't expired yet we can stop testing. If * we actually reached the end of the queue we * also need to stop, of course. */ if (!skb) break; ieee80211_free_txskb(&local->hw, skb); } /* * Now also check the normal PS-buffered queue, this will * only find something if the filtered queue was emptied * since the filtered frames are all before the normal PS * buffered frames. */ for (;;) { spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb = skb_peek(&sta->ps_tx_buf[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->ps_tx_buf[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); /* * frames are queued in order, so if this one * hasn't expired yet (or we reached the end of * the queue) we can stop testing */ if (!skb) break; local->total_ps_buffered--; ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n", sta->sta.addr); ieee80211_free_txskb(&local->hw, skb); } /* * Finally, recalculate the TIM bit for this station -- it might * now be clear because the station was too slow to retrieve its * frames. */ sta_info_recalc_tim(sta); /* * Return whether there are any frames still buffered, this is * used to check whether the cleanup timer still needs to run, * if there are no frames we don't need to rearm the timer. */ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) && skb_queue_empty(&sta->tx_filtered[ac])); } static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, struct sta_info *sta) { bool have_buffered = false; int ac; /* This is only necessary for stations on BSS/MBSS interfaces */ if (!sta->sdata->bss && !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) have_buffered |= sta_info_cleanup_expire_buffered_ac(local, sta, ac); return have_buffered; } static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; int ret, i; might_sleep(); if (!sta) return -ENOENT; local = sta->local; sdata = sta->sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * Before removing the station from the driver and * rate control, it might still start new aggregation * sessions -- block that to make sure the tear-down * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); /* * Before removing the station from the driver there might be pending * rx frames on RSS queues sent prior to the disassociation - wait for * all such frames to be processed. */ drv_sync_rx_queues(local, sta); for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; if (!(sta->sta.valid_links & BIT(i))) continue; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); link_sta_info_hash_del(local, link_sta); } ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; /* * for TDLS peers, make sure to return to the base channel before * removal. */ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) { drv_tdls_cancel_channel_switch(local, sdata, &sta->sta); clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL); } list_del_rcu(&sta->list); sta->removed = true; if (sta->uploaded) drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); return 0; } static int _sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state, bool recalc) { struct ieee80211_local *local = sta->local; might_sleep(); if (sta->sta_state == new_state) return 0; /* check allowed transitions first */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state != IEEE80211_STA_AUTH) return -EINVAL; break; case IEEE80211_STA_AUTH: if (sta->sta_state != IEEE80211_STA_NONE && sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; case IEEE80211_STA_ASSOC: if (sta->sta_state != IEEE80211_STA_AUTH && sta->sta_state != IEEE80211_STA_AUTHORIZED) return -EINVAL; break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; default: WARN(1, "invalid state %d", new_state); return -EINVAL; } sta_dbg(sta->sdata, "moving STA %pM to state %d\n", sta->sta.addr, new_state); /* notify the driver before the actual changes so it can * fail the transition */ if (test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) return err; } /* reflect the change in all state variables */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state == IEEE80211_STA_AUTH) clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); sta->assoc_at = ktime_get_boottime_ns(); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ieee80211_vif_dec_num_mcast(sta->sdata); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); /* * If we have encryption offload, flush (station) queues * (after ensuring concurrent TX completed) so we won't * transmit anything later unencrypted if/when keys are * also removed, which might otherwise happen depending * on how the hardware offload works. */ if (local->ops->set_key) { synchronize_net(); if (local->ops->flush_sta) drv_flush_sta(local, sta->sdata, sta); else ieee80211_flush_queues(local, sta->sdata, false); } ieee80211_clear_fast_xmit(sta); ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state == IEEE80211_STA_ASSOC) { ieee80211_vif_inc_num_mcast(sta->sdata); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sta->sdata->vif.type == NL80211_IFTYPE_AP) cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); break; default: break; } sta->sta_state = new_state; return 0; } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { return _sta_info_move_state(sta, new_state, true); } static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo; int ret; /* * NOTE: This assumes at least synchronize_net() was done * after _part1 and before _part2! */ /* * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA * but someone might have just gotten past a check, and not yet into * queuing the work/creating the data/etc. * * Do another round of destruction so that the worker is certainly * canceled before we later free the station. * * Since this is after synchronize_rcu()/synchronize_net() we're now * certain that nobody can actually hold a reference to the STA and * be calling e.g. ieee80211_start_tx_ba_session(). */ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta, true); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get(sdata, addr); return __sta_info_destroy(sta); } int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, addr); return __sta_info_destroy(sta); } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; err = rhltable_init(&local->link_sta_hash, &link_sta_rht_params); if (err) { rhltable_destroy(&local->sta_hash); return err; } spin_lock_init(&local->tim_lock); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata != sta->sdata && (!vlans || sdata->bss != sta->sdata->bss)) continue; if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } if (!list_empty(&free_list)) { bool support_p2p_ps = true; synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) { if (!sta->sta.support_p2p_ps) support_p2p_ps = false; __sta_info_destroy_part2(sta, false); } ieee80211_recalc_min_chandef(sdata, -1); if (!support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sdata); } return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. */ for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; if (!sta->uploaded) return NULL; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) { struct sta_info *sta; if (!vif) return NULL; sta = sta_info_get_bss(vif_to_sdata(vif), addr); if (!sta) return NULL; if (!sta->uploaded) return NULL; return &sta->sta; } EXPORT_SYMBOL(ieee80211_find_sta); /* powersave support code */ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) continue; schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } ieee80211_add_pending_skbs(local, &pending); /* now we're no longer in the deliver code */ clear_sta_flag(sta, WLAN_STA_PS_DELIVER); /* The station might have polled and then woken up before we responded, * so clear these flags now to avoid them sticking around. */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); local->total_ps_buffered -= buffered; sta_info_recalc_tim(sta); ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, bool call_driver, bool more_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) break; } } static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; unsigned long driver_release_tids = 0; struct sk_buff_head frames; bool more_data; /* Service or PS-Poll period starts */ set_sta_flag(sta, WLAN_STA_SP); __skb_queue_head_init(&frames); ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, &frames, &driver_release_tids); more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid, ac; /* * For PS-Poll, this can only happen due to a race condition * when we set the TIM bit and the station notices it, but * before it can poll for the frame we expire it. * * For uAPSD, this is said in the standard (11.2.1.5 h): * At each unscheduled SP for a non-AP STA, the AP shall * attempt to transmit at least one MSDU or MMPDU, but no * more than the value specified in the Max SP Length field * in the QoS Capability element from delivery-enabled ACs, * that are destined for the non-AP STA. * * Since we have no other MSDU/MMPDU, transmit a QoS null frame. */ /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; bool need_null = false; skb_queue_head_init(&pending); while ((skb = __skb_dequeue(&frames))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qoshdr = NULL; num++; /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; /* * Use MoreData flag to indicate whether there are * more buffered frames for this STA */ if (more_data || !skb_queue_empty(&frames)) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); else hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); tids |= BIT(skb->priority); __skb_queue_tail(&pending, skb); /* end service period after last frame or add one */ if (!skb_queue_empty(&frames)) continue; if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; break; } /* For uAPSD, things are a bit more complicated. If the * last frame has a QoS header (i.e. is a QoS-data or * QoS-nulldata frame) then just set the EOSP bit there * and be done. * If the frame doesn't have a QoS header (which means * it should be a bufferable MMPDU) then we can't set * the EOSP bit in the QoS header; add a QoS-nulldata * frame to the list to send it after the MMPDU. * * Note that this code is only in the mac80211-release * code path, we assume that the driver will not buffer * anything but QoS-data frames, or if it does, will * create the QoS-nulldata frame by itself if needed. * * Cf. 802.11-2012 10.2.1.10 (c). */ if (qoshdr) { *qoshdr |= IEEE80211_QOS_CTL_EOSP; info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; } else { /* The standard isn't completely clear on this * as it says the more-data bit should be set * if there are more BUs. The QoS-Null frame * we're about to send isn't buffered yet, we * only create it below, but let's pretend it * was buffered just in case some clients only * expect more-data=0 when eosp=1. */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); need_null = true; num++; } break; } drv_allow_buffered_frames(local, sta, tids, num, reason, more_data); ieee80211_add_pending_skbs(local, &pending); if (need_null) ieee80211_send_null_response( sta, find_highest_prio_tid(tids), reason, false, false); sta_info_recalc_tim(sta); } else { int tid; /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. * Note that the driver also has to check the number of frames * on the TIDs we're releasing from - if there are more than * n_frames it has to set the more-data bit (if we didn't ask * it to set it anyway due to other buffered frames); if there * are fewer than n_frames it has to make sure to adjust that * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) * became empty or we find that a txq became empty, we'll do the * TIM recalculation. */ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); break; } } } void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) { u8 ignore_for_response = sta->sta.uapsd_queues; /* * If all ACs are delivery-enabled then we should reply * from any of them, if only some are enabled we reply * only from the non-enabled ones. */ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_response = 0; ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response, IEEE80211_FRAME_RELEASE_PSPOLL); } void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta) { int n_frames = sta->sta.max_sp; u8 delivery_enabled = sta->sta.uapsd_queues; /* * If we ever grow support for TSPEC this might happen if * the TSPEC update from hostapd comes in between a trigger * frame setting WLAN_STA_UAPSD in the RX path and this * actually getting called. */ if (!delivery_enabled) return; switch (sta->sta.max_sp) { case 1: n_frames = 2; break; case 2: n_frames = 4; break; case 3: n_frames = 6; break; case 0: /* XXX: what is a good value? */ n_frames = 128; break; } ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled, IEEE80211_FRAME_RELEASE_UAPSD); } void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); trace_api_sta_block_awake(sta->local, pubsta, block); if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_clear_fast_xmit(sta); return; } if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) return; if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { set_sta_flag(sta, WLAN_STA_PS_DELIVER); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || test_sta_flag(sta, WLAN_STA_UAPSD)) { /* must be asleep in this case */ clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; trace_api_eosp(local, pubsta); clear_sta_flag(sta, WLAN_STA_SP); } EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); enum ieee80211_frame_release_type reason; bool more_data; trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); reason = IEEE80211_FRAME_RELEASE_UAPSD; more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, reason, 0); ieee80211_send_null_response(sta, tid, reason, false, more_data); } EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); if (buffered) set_bit(tid, &sta->driver_buffered_tids); else clear_bit(tid, &sta->driver_buffered_tids); sta_info_recalc_tim(sta); } EXPORT_SYMBOL(ieee80211_sta_set_buffered); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->sdata->local; u8 ac = ieee80211_ac_from_tid(tid); u32 airtime = 0; if (sta->local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; if (sta->local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].tx_airtime += tx_airtime; sta->airtime[ac].rx_airtime += rx_airtime; if (ieee80211_sta_keep_active(sta, ac)) sta->airtime[ac].deficit -= airtime; spin_unlock_bh(&local->active_txq_lock[ac]); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { bool first = true; int link_id; if (!sta->sta.valid_links || !sta->sta.mlo) { sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { struct ieee80211_link_sta *link_sta; int i; if (!(active_links & BIT(link_id))) continue; link_sta = rcu_dereference(sta->sta.link[link_id]); if (!link_sta) continue; if (first) { sta->cur = sta->sta.deflink.agg; first = false; continue; } sta->cur.max_amsdu_len = min(sta->cur.max_amsdu_len, link_sta->agg.max_amsdu_len); sta->cur.max_rc_amsdu_len = min(sta->cur.max_rc_amsdu_len, link_sta->agg.max_rc_amsdu_len); for (i = 0; i < ARRAY_SIZE(sta->cur.max_tid_amsdu_len); i++) sta->cur.max_tid_amsdu_len[i] = min(sta->cur.max_tid_amsdu_len[i], link_sta->agg.max_tid_amsdu_len[i]); } rcu_read_unlock(); sta->sta.cur = &sta->cur; } void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } atomic_sub(tx_airtime, &local->aql_total_pending_airtime); tx_pending = atomic_sub_return(tx_airtime, &local->aql_ac_pending_airtime[ac]); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) { atomic_cmpxchg(&local->aql_ac_pending_airtime[ac], tx_pending, 0); atomic_sub(tx_pending, &local->aql_total_pending_airtime); } } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; int cpu; if (!sta->deflink.pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; case STA_STATS_RATE_TYPE_EHT: rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); rinfo->nss = STA_STATS_GET(EHT_NSS, rate); rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, tid); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } if (tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->deflink.u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->deflink.rx_stats.packets; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime_weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->deflink.pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && !sta->sta.valid_links && ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i); } if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; #endif } sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } if (ieee80211_vif_is_mesh(&sdata->vif)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); if (!sta->deflink.status_stats.last_ack || time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) return stats->last_rx; return sta->deflink.status_stats.last_ack; } static void sta_update_codel_params(struct sta_info *sta, u32 thr) { if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); sta->cparams.ecn = false; } else { sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; } } void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); sta_update_codel_params(sta, thr); } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct sta_link_alloc *alloc; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); /* must represent an MLD from the start */ if (WARN_ON(!sta->sta.valid_links)) return -EINVAL; if (WARN_ON(sta->sta.valid_links & BIT(link_id) || sta->link[link_id])) return -EBUSY; alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); if (!alloc) return -ENOMEM; ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL); if (ret) { kfree(alloc); return ret; } sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); ieee80211_link_sta_debugfs_add(&alloc->info); return 0; } void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id) { lockdep_assert_wiphy(sta->sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); sta_remove_link(sta, link_id, false); } int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct link_sta_info *link_sta; u16 old_links = sta->sta.valid_links; u16 new_links = old_links | BIT(link_id); int ret; link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sdata->local->hw.wiphy->mtx)); if (WARN_ON(old_links == new_links || !link_sta)) return -EINVAL; rcu_read_lock(); if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) { rcu_read_unlock(); return -EALREADY; } /* we only modify under the mutex so this is fine */ rcu_read_unlock(); sta->sta.valid_links = new_links; if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) goto hash; ieee80211_recalc_min_chandef(sdata, link_id); /* Ensure the values are updated for the driver, * redone by sta_remove_link on failure. */ ieee80211_sta_recalc_aggregates(&sta->sta); ret = drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, new_links); if (ret) { sta->sta.valid_links = old_links; sta_remove_link(sta, link_id, false); return ret; } hash: ret = link_sta_info_hash_add(sdata->local, link_sta); WARN_ON(ret); return 0; } void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; u16 old_links = sta->sta.valid_links; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta->sta.valid_links &= ~BIT(link_id); if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, sta->sta.valid_links); sta_remove_link(sta, link_id, true); } void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len) { u8 val; sta->sta.max_amsdu_subframes = 0; if (ext_capab_len < 8) return; /* The sender might not have sent the last bit, consider it to be 0 */ val = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB); /* we did get all the bits, take the MSB as well */ if (ext_capab_len >= 9) val |= u8_get_bits(ext_capab[8], WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB) << 1; if (val) sta->sta.max_amsdu_subframes = 4 << (4 - val); } #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); return lockdep_is_held(&sta->local->hw.wiphy->mtx); } EXPORT_SYMBOL(lockdep_sta_mutex_held); #endif
63 8158 579 141 242 550 550 804 1535 2426 108 3 75 5054 5056 1100 3058 1069 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMSTAT_H #define _LINUX_VMSTAT_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> #include <linux/static_key.h> #include <linux/mmdebug.h> extern int sysctl_stat_interval; #ifdef CONFIG_NUMA #define ENABLE_NUMA_STAT 1 #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { unsigned nr_dirty; unsigned nr_unqueued_dirty; unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; unsigned nr_demoted; }; /* Stat data for system wide items */ enum vm_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, NR_MEMMAP_PAGES, /* page metadata allocated through buddy allocator */ NR_MEMMAP_BOOT_PAGES, /* page metadata allocated through boot allocator */ NR_VM_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. * * Counters should only be incremented and no critical kernel component * should rely on the counter values. * * Counters are handled completely inline. On many platforms the code * generated will simply be the increment of a global address. */ struct vm_event_state { unsigned long event[NR_VM_EVENT_ITEMS]; }; DECLARE_PER_CPU(struct vm_event_state, vm_event_states); /* * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the * local_irq_disable overhead. */ static inline void __count_vm_event(enum vm_event_item item) { raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); #else /* Disable counters */ static inline void count_vm_event(enum vm_event_item item) { } static inline void count_vm_events(enum vm_event_item item, long delta) { } static inline void __count_vm_event(enum vm_event_item item) { } static inline void __count_vm_events(enum vm_event_item item, long delta) { } static inline void all_vm_events(unsigned long *ret) { } static inline void vm_events_fold_cpu(int cpu) { } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_NUMA_BALANCING #define count_vm_numa_event(x) count_vm_event(x) #define count_vm_numa_events(x, y) count_vm_events(x, y) #else #define count_vm_numa_event(x) do {} while (0) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_DEBUG_TLBFLUSH #define count_vm_tlb_event(x) count_vm_event(x) #define count_vm_tlb_events(x, y) count_vm_events(x, y) #else #define count_vm_tlb_event(x) do {} while (0) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif #ifdef CONFIG_PER_VMA_LOCK_STATS #define count_vm_vma_lock_event(x) count_vm_event(x) #else #define count_vm_vma_lock_event(x) do {} while (0) #endif #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_NUMA static inline void zone_numa_event_add(long x, struct zone *zone, enum numa_stat_item item) { atomic_long_add(x, &zone->vm_numa_event[item]); atomic_long_add(x, &vm_numa_event[item]); } static inline unsigned long zone_numa_event_state(struct zone *zone, enum numa_stat_item item) { return atomic_long_read(&zone->vm_numa_event[item]); } static inline unsigned long global_numa_event_state(enum numa_stat_item item) { return atomic_long_read(&vm_numa_event[item]); } #endif /* CONFIG_NUMA */ static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]); } static inline void node_page_state_add(long x, struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_add(x, &pgdat->vm_stat[item]); atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state_pages(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); return global_node_page_state_pages(item); } static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * More accurate version that also considers the currently pending * deltas. For that we need to loop over all cpus to find the current * deltas. There is no synchronization so the result cannot be * exactly accurate either. */ static inline unsigned long zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; #endif return x; } #ifdef CONFIG_NUMA /* See __count_vm_event comment on why raw_cpu_inc is used. */ static inline void __count_numa_event(struct zone *zone, enum numa_stat_item item) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_inc(pzstats->vm_numa_event[item]); } static inline void __count_numa_events(struct zone *zone, enum numa_stat_item item, long delta) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; raw_cpu_add(pzstats->vm_numa_event[item], delta); } extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); extern void fold_vm_numa_events(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) static inline void fold_vm_numa_events(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); void __inc_node_page_state(struct page *, enum node_stat_item); void __dec_node_page_state(struct page *, enum node_stat_item); void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; int vmstat_refresh(const struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* * We do not maintain differentials in a single processor configuration. * The functions directly modify the zone and global counters. */ static inline void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { /* * Only cgroups use subpage accounting right now; at * the global level, these items still change in * multiples of whole pages. Store them as pages * internally to keep the per-cpu counters compact. */ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } node_page_state_add(delta, pgdat, item); } static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]); } static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_inc(&pgdat->vm_stat[item]); atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]); } static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_dec(&pgdat->vm_stat[item]); atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } static inline void __inc_node_page_state(struct page *page, enum node_stat_item item) { __inc_node_state(page_pgdat(page), item); } static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } static inline void __dec_node_page_state(struct page *page, enum node_stat_item item) { __dec_node_state(page_pgdat(page), item); } /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. */ #define inc_zone_page_state __inc_zone_page_state #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state #define inc_node_page_state __inc_node_page_state #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state #define inc_zone_state __inc_zone_state #define inc_node_state __inc_node_state #define dec_zone_state __dec_zone_state #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { __mod_zone_page_state(folio_zone(folio), item, nr); } static inline void __zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void __zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void zone_stat_mod_folio(struct folio *folio, enum zone_stat_item item, long nr) { mod_zone_page_state(folio_zone(folio), item, nr); } static inline void zone_stat_add_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); } static inline void zone_stat_sub_folio(struct folio *folio, enum zone_stat_item item) { mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); } static inline void __node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { __mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void __node_stat_add_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void __node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } static inline void node_stat_mod_folio(struct folio *folio, enum node_stat_item item, long nr) { mod_node_page_state(folio_pgdat(folio), item, nr); } static inline void node_stat_add_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); } static inline void node_stat_sub_folio(struct folio *folio, enum node_stat_item item) { mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); } extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) { return vmstat_text[item]; } #ifdef CONFIG_NUMA static inline const char *numa_stat_name(enum numa_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + item]; } #endif /* CONFIG_NUMA */ static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + item]; } static inline const char *lru_list_name(enum lru_list lru) { return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ #ifdef CONFIG_MEMCG void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_state(lruvec, idx, val); local_irq_restore(flags); } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __lruvec_stat_mod_folio(folio, idx, val); local_irq_restore(flags); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { lruvec_stat_mod_folio(page_folio(page), idx, val); } #else static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { __mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { mod_node_page_state(page_pgdat(page), idx, val); } #endif /* CONFIG_MEMCG */ static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void __lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); } static inline void lruvec_stat_sub_folio(struct folio *folio, enum node_stat_item idx) { lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } void memmap_boot_pages_add(long delta); void memmap_pages_add(long delta); #endif /* _LINUX_VMSTAT_H */
1 1 1 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, }; static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward = true; module_param(forward, bool, 0000); static int ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; int err; repl = ip6t_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } static int __net_init ip6table_filter_net_init(struct net *net) { if (!forward) return ip6table_filter_table_init(net); return 0; } static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "filter"); } static struct pernet_operations ip6table_filter_net_ops = { .init = ip6table_filter_net_init, .pre_exit = ip6table_filter_net_pre_exit, .exit = ip6table_filter_net_exit, }; static int __init ip6table_filter_init(void) { int ret = xt_register_template(&packet_filter, ip6table_filter_table_init); if (ret < 0) return ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); if (IS_ERR(filter_ops)) { xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); } ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) { xt_unregister_template(&packet_filter); kfree(filter_ops); return ret; } return ret; } static void __exit ip6table_filter_fini(void) { unregister_pernet_subsys(&ip6table_filter_net_ops); xt_unregister_template(&packet_filter); kfree(filter_ops); } module_init(ip6table_filter_init); module_exit(ip6table_filter_fini);
4 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/lapb.h> static void lapb_t1timer_expiry(struct timer_list *); static void lapb_t2timer_expiry(struct timer_list *); void lapb_start_t1timer(struct lapb_cb *lapb) { del_timer(&lapb->t1timer); lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; lapb->t1timer_running = true; add_timer(&lapb->t1timer); } void lapb_start_t2timer(struct lapb_cb *lapb) { del_timer(&lapb->t2timer); lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; lapb->t2timer_running = true; add_timer(&lapb->t2timer); } void lapb_stop_t1timer(struct lapb_cb *lapb) { lapb->t1timer_running = false; del_timer(&lapb->t1timer); } void lapb_stop_t2timer(struct lapb_cb *lapb) { lapb->t2timer_running = false; del_timer(&lapb->t2timer); } int lapb_t1timer_running(struct lapb_cb *lapb) { return lapb->t1timer_running; } static void lapb_t2timer_expiry(struct timer_list *t) { struct lapb_cb *lapb = from_timer(lapb, t, t2timer); spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */ goto out; if (!lapb->t2timer_running) /* The timer has been stopped */ goto out; if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; lapb_timeout_response(lapb); } lapb->t2timer_running = false; out: spin_unlock_bh(&lapb->lock); } static void lapb_t1timer_expiry(struct timer_list *t) { struct lapb_cb *lapb = from_timer(lapb, t, t1timer); spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */ goto out; if (!lapb->t1timer_running) /* The timer has been stopped */ goto out; switch (lapb->state) { /* * If we are a DCE, send DM up to N2 times, then switch to * STATE_1 and send SABM(E). */ case LAPB_STATE_0: if (lapb->mode & LAPB_DCE && lapb->n2count != lapb->n2) { lapb->n2count++; lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE); } else { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } break; /* * Awaiting connection state, send SABM(E), up to N2 times. */ case LAPB_STATE_1: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; if (lapb->mode & LAPB_EXTENDED) { lapb_dbg(1, "(%p) S1 TX SABME(1)\n", lapb->dev); lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); } else { lapb_dbg(1, "(%p) S1 TX SABM(1)\n", lapb->dev); lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); } } break; /* * Awaiting disconnection state, send DISC, up to N2 times. */ case LAPB_STATE_2: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; lapb_dbg(1, "(%p) S2 TX DISC(1)\n", lapb->dev); lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); } break; /* * Data transfer state, restransmit I frames, up to N2 times. */ case LAPB_STATE_3: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_stop_t2timer(lapb); lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; lapb_requeue_frames(lapb); lapb_kick(lapb); } break; /* * Frame reject state, restransmit FRMR frames, up to N2 times. */ case LAPB_STATE_4: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; lapb_transmit_frmr(lapb); } break; } lapb_start_t1timer(lapb); out: spin_unlock_bh(&lapb->lock); }
74 75 70 70 42 42 42 5 5 36 36 107 75 7 7 1 6 6 26 25 1 25 24 41 41 3 3 4 4 1 3 2 8 2 2 3 3 3 3 1 1 1 2 2 2 2 2 1 31 18 16 57 42 105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * inode.c */ /* * This file implements code to create and read inodes from disk. * * Inodes in Squashfs are identified by a 48-bit inode which encodes the * location of the compressed metadata block containing the inode, and the byte * offset into that block where the inode is placed (<block, offset>). * * To maximise compression there are different inodes for each file type * (regular file, directory, device, etc.), the inode contents and length * varying with the type. * * To further maximise compression, two types of regular file inode and * directory inode are defined: inodes optimised for frequently occurring * regular files and directories, and extended types where extra * information has to be stored. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/xattr.h> #include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" /* * Initialise VFS inode with the base inode information common to all * Squashfs inode types. Sqsh_ino contains the unswapped base inode * off disk. */ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, struct squashfs_base_inode *sqsh_ino) { uid_t i_uid; gid_t i_gid; int err; inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); if (inode->i_ino == 0) return -EINVAL; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid); if (err) return err; i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); inode_set_atime(inode, inode_get_mtime_sec(inode), 0); inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; return err; } struct inode *squashfs_iget(struct super_block *sb, long long ino, unsigned int ino_number) { struct inode *inode = iget_locked(sb, ino_number); int err; TRACE("Entered squashfs_iget\n"); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); if (err) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } /* * Initialise VFS inode by reading inode from inode table (compressed * metadata). The format and amount of data read depends on type. */ int squashfs_read_inode(struct inode *inode, long long ino) { struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; int err, type, offset = SQUASHFS_INODE_OFFSET(ino); union squashfs_inode squashfs_ino; struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; int xattr_id = SQUASHFS_INVALID_XATTR; TRACE("Entered squashfs_read_inode\n"); /* * Read inode base common to all inode types. */ err = squashfs_read_metadata(sb, sqshb_ino, &block, &offset, sizeof(*sqshb_ino)); if (err < 0) goto failed_read; err = squashfs_new_inode(sb, inode, sqshb_ino); if (err) goto failed_read; block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; offset = SQUASHFS_INODE_OFFSET(ino); type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_LREG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = (inode->i_size - le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_DIR_TYPE: { struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_cnt = 0; squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_LDIR_TYPE: { struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_start = block; squashfs_i(inode)->dir_idx_offset = offset; squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Long directory inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_SYMLINK_TYPE: case SQUASHFS_LSYMLINK_TYPE: { struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); if (inode->i_size > PAGE_SIZE) { ERROR("Corrupted symlink\n"); return -EINVAL; } set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; if (type == SQUASHFS_LSYMLINK_TYPE) { __le32 xattr; err = squashfs_read_metadata(sb, NULL, &block, &offset, inode->i_size); if (err < 0) goto failed_read; err = squashfs_read_metadata(sb, &xattr, &block, &offset, sizeof(xattr)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(xattr); } TRACE("Symbolic link inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, block, offset); break; } case SQUASHFS_BLKDEV_TYPE: case SQUASHFS_CHRDEV_TYPE: { struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_CHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_LBLKDEV_TYPE: case SQUASHFS_LCHRDEV_TYPE: { struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LCHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_FIFO_TYPE: case SQUASHFS_SOCKET_TYPE: { struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_FIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } case SQUASHFS_LFIFO_TYPE: case SQUASHFS_LSOCKET_TYPE: { struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LFIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } default: ERROR("Unknown inode type %d in squashfs_iget!\n", type); return -EINVAL; } if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { err = squashfs_xattr_lookup(sb, xattr_id, &squashfs_i(inode)->xattr_count, &squashfs_i(inode)->xattr_size, &squashfs_i(inode)->xattr); if (err < 0) goto failed_read; inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + 1; } else squashfs_i(inode)->xattr_count = 0; return 0; failed_read: ERROR("Unable to read inode 0x%llx\n", ino); return err; } const struct inode_operations squashfs_inode_ops = { .listxattr = squashfs_listxattr };
23 23 23 21 23 23 23 4 4 4 4 4 4 3 1 1 4 4 4 4 4 4 4 4 8 4 3 5 5 3 1 1 3 2 2 4 4 4 4 4 4 4 8 8 1 24 24 24 24 1 24 3 3 24 24 7 7 23 23 23 23 23 15 8 8 23 23 22 22 14 3 14 14 13 1 8 8 8 8 21 20 13 21 23 15 15 3 2 3 3 24 24 17 17 25 9 19 25 10 25 24 24 33 31 30 32 31 8 25 31 31 31 31 32 23 32 33 33 16 17 17 33 27 25 25 33 16 17 30 33 17 17 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 // SPDX-License-Identifier: GPL-2.0 #include <linux/blkdev.h> #include <linux/iversion.h> #include "ctree.h" #include "fs.h" #include "messages.h" #include "compression.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" #include "accessors.h" #include "file-item.h" #include "file.h" #include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, const u64 olen, int no_time_update) { int ret; inode_inc_iversion(inode); if (!no_time_update) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. */ if (endoff > destoff + olen) endoff = destoff + olen; if (endoff > inode->i_size) { i_size_write(inode, endoff); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } ret = btrfs_end_transaction(trans); out: return ret; } static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, const u64 datal, const u8 comp_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 block_size = fs_info->sectorsize; const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); /* * We have flushed and locked the ranges of the source and destination * inodes, we also have locked the inodes, so we are safe to do a * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, block_size); if (ret) goto out; folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; /* * After dirtying the page our caller will need to start a transaction, * and if we are low on metadata free space, that can cause flushing of * delalloc for all inodes in order to get metadata space released. * However we are holding the range locked for the whole duration of * the clone/dedupe operation, so we may deadlock if that happens and no * other task releases enough space. So mark this inode as not being * possible to flush to avoid such deadlock. We will clear that flag * when we finish cloning all extents, since a transaction is started * after finding each extent to clone. */ set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, datal); } else { ret = btrfs_decompress(comp_type, data_start, folio, offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; flush_dcache_folio(folio); } /* * If our inline data is smaller then the block/page size, then the * remaining of the block/page is equivalent to zeroes. We had something * like the following done: * * $ xfs_io -f -c "pwrite -S 0xab 0 500" file * $ sync # (or fsync) * $ xfs_io -c "falloc 0 4K" file * $ xfs_io -c "pwrite -S 0xcd 4K 4K" * * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) folio_zero_range(folio, datal, block_size - datal); btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: if (!IS_ERR(folio)) { folio_unlock(folio); folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, block_size, true); btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); return ret; } /* * Deal with cloning of inline extents. We try to copy the inline extent from * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, const u64 datal, const u64 size, const u8 comp_type, char *inline_data, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; struct btrfs_key key; if (new_key->offset > 0) { ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; } key.objectid = btrfs_ino(BTRFS_I(dst)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { return ret; } else if (ret > 0) { if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) return ret; else if (ret > 0) goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == btrfs_ino(BTRFS_I(dst)) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the * inline extent's data to the page. */ ASSERT(key.offset > 0); goto copy_to_page; } } else if (i_size_read(dst) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); /* * If it's an inline extent replace it with the source inline * extent, otherwise copy the source inline extent data into * the respective page at the destination inode. */ if (btrfs_file_extent_type(path->nodes[0], ei) == BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; goto copy_to_page; } copy_inline_extent: /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. */ if (i_size_read(dst) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ goto copy_to_page; } /* * Release path before starting a new transaction so we don't hold locks * that would confuse lockdep. */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf * of the destination inode. We know we will drop or adjust at most one * extent item in the destination root. * * 1 unit - adjusting old extent (we may have to split it) * 1 unit - add new extent * 1 unit - inode update */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } drop_args.path = path; drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); if (ret) goto out; write_extent_buffer(path->nodes[0], inline_data, btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); btrfs_set_inode_full_sync(BTRFS_I(dst)); ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); out: if (!ret && !trans) { /* * No transaction here means we copied the inline extent into a * page of the destination inode. * * 1 unit to update inode item */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; } } if (ret && trans) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); } if (!ret) *trans_out = trans; return ret; copy_to_page: /* * Release our path because we don't need it anymore and also because * copy_inline_to_page() needs to reserve data and metadata, which may * need to flush delalloc when we are low on available space and * therefore cause a deadlock if writeback of an inline extent needs to * write to the same leaf or an ordered extent completion needs to write * to the same leaf. */ btrfs_release_path(path); ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; } /* * Clone a range from inode file to another. * * @src: Inode to clone from * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_path *path = NULL; struct extent_buffer *leaf; struct btrfs_trans_handle *trans; char *buf = NULL; struct btrfs_key key; u32 nritems; int slot; int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!buf) return ret; path = btrfs_alloc_path(); if (!path) { kvfree(buf); return ret; } path->reada = READA_FORWARD; /* Clone data */ key.objectid = btrfs_ino(BTRFS_I(src)); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = off; while (1) { struct btrfs_file_extent_item *extent; u64 extent_gen; int type; u32 size; struct btrfs_key new_key; u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; u64 drop_start; /* Note the key will change type as we walk through the tree */ ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 0, 0); if (ret < 0) goto out; /* * First search, if no extent item that starts at offset off was * found but the previous item is an extent item, it's possible * it might overlap our target range, therefore process it. */ if (key.offset == off && ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } nritems = btrfs_header_nritems(path->nodes[0]); process_slot: if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) goto out; if (ret > 0) break; nritems = btrfs_header_nritems(path->nodes[0]); } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.type > BTRFS_EXTENT_DATA_KEY || key.objectid != btrfs_ino(BTRFS_I(src))) break; ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); extent_gen = btrfs_file_extent_generation(leaf, extent); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { disko = btrfs_file_extent_disk_bytenr(leaf, extent); diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); datao = btrfs_file_extent_offset(leaf, extent); datal = btrfs_file_extent_num_bytes(leaf, extent); } else if (type == BTRFS_FILE_EXTENT_INLINE) { /* Take upper bound, may be compressed */ datal = btrfs_file_extent_ram_bytes(leaf, extent); } /* * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. * * Subsequent searches may leave us on a file range we have * processed before - this happens due to a race with ordered * extent completion for a file range that is outside our source * range, but that range was part of a file extent item that * also covered a leading part of our source range. */ if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); btrfs_release_path(path); memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = btrfs_ino(BTRFS_I(inode)); if (off <= key.offset) new_key.offset = key.offset + destoff - off; else new_key.offset = destoff; /* * Deal with a hole that doesn't have an extent item that * represents it (NO_HOLES feature enabled). * This hole is either in the middle of the cloning range or at * the beginning (fully overlaps it or partially overlaps it). */ if (new_key.offset != last_dest_end) drop_start = last_dest_end; else drop_start = new_key.offset; if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b * | ------------- extent ------------- | */ /* Subtract range b */ if (key.offset + datal > off + len) datal = off + len - key.offset; /* Subtract range a */ if (off > key.offset) { datao += off - key.offset; datal -= off - key.offset; } clone_info.disk_offset = disko; clone_info.disk_len = diskl; clone_info.data_offset = datao; clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) goto out; } else { ASSERT(type == BTRFS_FILE_EXTENT_INLINE); /* * Inline extents always have to start at file offset 0 * and can never be bigger then the sector size. We can * never clone only parts of an inline extent, since all * reflink operations must start at a sector size aligned * offset, and the length must be aligned too or end at * the i_size (which implies the whole inlined data). */ ASSERT(key.offset == 0); ASSERT(datal <= fs_info->sectorsize); if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || WARN_ON(key.offset != 0) || WARN_ON(datal > fs_info->sectorsize)) { ret = -EUCLEAN; goto out; } ret = clone_copy_inline_extent(inode, path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) goto out; } btrfs_release_path(path); /* * Whenever we share an extent we update the last_reflink_trans * of each inode to the current transaction. This is needed to * make sure fsync does not log multiple checksum items with * overlapping ranges (because some extent items might refer * only to sections of the original extent). For the destination * inode we do this regardless of the generation of the extents * or even if they are inline extents or explicit holes, to make * sure a full fsync does not skip them. For the source inode, * we only need to update last_reflink_trans in case it's a new * extent that is not a hole or an inline extent, to deal with * the checksums problem on fsync. */ if (extent_gen == trans->transid && disko > 0) BTRFS_I(src)->last_reflink_trans = trans->transid; BTRFS_I(inode)->last_reflink_trans = trans->transid; last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen, no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) break; btrfs_release_path(path); key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); } ret = 0; if (last_dest_end < destoff + len) { /* * We have an implicit hole that fully or partially overlaps our * cloning range at its end. This means that we either have the * NO_HOLES feature enabled or the implicit hole happened due to * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); /* * When using NO_HOLES and we are cloning a range that covers * only a hole (no extents) into a range beyond the current * i_size, punching a hole in the target range will not create * an extent map defining a hole, because the range starts at or * beyond current i_size. If the file previously had an i_size * greater than the new i_size set by this clone operation, we * need to make sure the next fsync is a full fsync, so that it * detects and logs a hole covering a range from the current * i_size to the new i_size. If the clone range covers extents, * besides a hole, then we know the full sync flag was already * set by previous calls to btrfs_replace_file_extents() that * replaced file extent items. */ if (last_dest_end >= i_size_read(inode)) btrfs_set_inode_full_sync(BTRFS_I(inode)); ret = btrfs_replace_file_extents(BTRFS_I(inode), path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; ret = clone_finish_inode_update(trans, inode, destoff + len, destoff, olen, no_time_update); } out: btrfs_free_path(path); kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); return ret; } static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); down_write(&BTRFS_I(inode1)->i_mmap_lock); down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); } static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) { up_write(&BTRFS_I(inode1)->i_mmap_lock); up_write(&BTRFS_I(inode2)->i_mmap_lock); } static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; /* * Lock destination range to serialize with concurrent readahead(), and * we are safe from concurrency with relocation of source extents * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); return ret; } static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret = 0; u64 i, tail_len, chunk_count; struct btrfs_root *root_dst = BTRFS_I(dst)->root; spin_lock(&root_dst->root_item_lock); if (root_dst->send_in_progress) { btrfs_warn_rl(root_dst->fs_info, "cannot deduplicate to root %llu while send operations are using it (%d in progress)", btrfs_root_id(root_dst), root_dst->send_in_progress); spin_unlock(&root_dst->root_item_lock); return -EAGAIN; } root_dst->dedupe_in_progress++; spin_unlock(&root_dst->root_item_lock); tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, dst, dst_loff); if (ret) goto out; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; } if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; spin_unlock(&root_dst->root_item_lock); return ret; } static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 off, u64 olen, u64 destoff) { struct extent_state *cached_state = NULL; struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); int ret; int wb_ret; u64 len = olen; u64 bs = fs_info->sectorsize; u64 end; /* * VFS's generic_remap_file_range_prep() protects us from cloning the * eof block into the middle of a file, which would result in corruption * if the file size is not blocksize aligned. So we don't need to check * for that case here. */ if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; if (destoff > inode->i_size) { const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); if (ret) return ret; /* * We may have truncated the last block if the inode's size is * not sector size aligned, so we need to wait for writeback to * complete before proceeding further, otherwise we can race * with cloning and attempt to increment a reference to an * extent that no longer exists (writeback completed right after * we found the previous extent covering eof and before we * attempted to increment its reference count). */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), wb_start, destoff - wb_start); if (ret) return ret; } /* * Lock destination range to serialize with concurrent readahead(), and * we are safe from concurrency with relocation of source extents * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ end = destoff + len - 1; lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination * range, so wait for writeback to complete before truncating pages * from the page cache. This is a rare case. */ wb_ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); ret = ret ? ret : wb_ret; /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. */ truncate_inode_pages_range(&inode->i_data, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); btrfs_btree_balance_dirty(fs_info); return ret; } static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { struct btrfs_root *root_out = BTRFS_I(inode_out)->root; if (btrfs_root_readonly(root_out)) return -EROFS; ASSERT(inode_in->i_sb == inode_out->i_sb); } /* Don't make the dst file partly checksummed */ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } /* * Now that the inodes are locked, we need to start writeback ourselves * and can not rely on the writeback from the VFS's generic helper * generic_remap_file_range_prep() because: * * 1) For compression we must call filemap_fdatawrite_range() range * twice (btrfs_fdatawrite_range() does it for us), and the generic * helper only calls it once; * * 2) filemap_fdatawrite_range(), called by the generic helper only * waits for the writeback to complete, i.e. for IO to be done, and * not for the ordered extents to complete. We need to wait for them * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. * * Btrfs' back references do not have a block level granularity, they * work at the whole extent level. * NOCOW buffered write without data space reserved may not be able * to fall back to CoW due to lack of data space, thus could cause * data loss. * * Here we take a shortcut by flushing the whole inode, so that all * nocow write should reach disk as nocow before we increase the * reference of the extent. We could do better by only flushing NOCOW * data, but that needs extra accounting. * * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. */ ret = filemap_flush(inode_in->i_mapping); if (ret < 0) return ret; ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags); } static bool file_sync_write(const struct file *file) { if (file->f_flags & (__O_SYNC | O_DSYNC)) return true; if (IS_SYNC(file_inode(file))) return true; return false; } loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); struct inode *dst_inode = file_inode(dst_file); bool same_inode = dst_inode == src_inode; int ret; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; if (same_inode) { btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); if (ret < 0 || len == 0) goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); } /* * If either the source or the destination file was opened with O_SYNC, * O_DSYNC or has the S_SYNC attribute, fsync both the destination and * source files/ranges, so that after a successful return (0) followed * by a power failure results in the reflinked data to be readable from * both files/ranges. */ if (ret == 0 && len > 0 && (file_sync_write(src_file) || file_sync_write(dst_file))) { ret = btrfs_sync_file(src_file, off, off + len - 1, 0); if (ret == 0) ret = btrfs_sync_file(dst_file, destoff, destoff + len - 1, 0); } return ret < 0 ? ret : len; }
17 17 17 17 17 17 17 17 17 17 17 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005-2006 Dell Inc. * * Serial Attached SCSI (SAS) transport class. * * The SAS transport class contains common code to deal with SAS HBAs, * an aproximated representation of SAS topologies in the driver model, * and various sysfs attributes to expose these topologies and management * interfaces to userspace. * * In addition to the basic SCSI core objects this transport class * introduces two additional intermediate objects: The SAS PHY * as represented by struct sas_phy defines an "outgoing" PHY on * a SAS HBA or Expander, and the SAS remote PHY represented by * struct sas_rphy defines an "incoming" PHY on a SAS Expander or * end device. Note that this is purely a software concept, the * underlying hardware for a PHY and a remote PHY is the exactly * the same. * * There is no concept of a SAS port in this code, users can see * what PHYs form a wide port based on the port_identifier attribute, * which is the same for all PHYs in a port. */ #include <linux/init.h> #include <linux/module.h> #include <linux/jiffies.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/blkdev.h> #include <linux/bsg.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_transport_sas.h> #include "scsi_sas_internal.h" struct sas_host_attrs { struct list_head rphy_list; struct mutex lock; struct request_queue *q; u32 next_target_id; u32 next_expander_id; int next_port_id; }; #define to_sas_host_attrs(host) ((struct sas_host_attrs *)(host)->shost_data) /* * Hack to allow attributes of the same name in different objects. */ #define SAS_DEVICE_ATTR(_prefix,_name,_mode,_show,_store) \ struct device_attribute dev_attr_##_prefix##_##_name = \ __ATTR(_name,_mode,_show,_store) /* * Pretty printing helpers */ #define sas_bitfield_name_match(title, table) \ static ssize_t \ get_sas_##title##_names(u32 table_key, char *buf) \ { \ char *prefix = ""; \ ssize_t len = 0; \ int i; \ \ for (i = 0; i < ARRAY_SIZE(table); i++) { \ if (table[i].value & table_key) { \ len += sprintf(buf + len, "%s%s", \ prefix, table[i].name); \ prefix = ", "; \ } \ } \ len += sprintf(buf + len, "\n"); \ return len; \ } #define sas_bitfield_name_set(title, table) \ static ssize_t \ set_sas_##title##_names(u32 *table_key, const char *buf) \ { \ ssize_t len = 0; \ int i; \ \ for (i = 0; i < ARRAY_SIZE(table); i++) { \ len = strlen(table[i].name); \ if (strncmp(buf, table[i].name, len) == 0 && \ (buf[len] == '\n' || buf[len] == '\0')) { \ *table_key = table[i].value; \ return 0; \ } \ } \ return -EINVAL; \ } #define sas_bitfield_name_search(title, table) \ static ssize_t \ get_sas_##title##_names(u32 table_key, char *buf) \ { \ ssize_t len = 0; \ int i; \ \ for (i = 0; i < ARRAY_SIZE(table); i++) { \ if (table[i].value == table_key) { \ len += sprintf(buf + len, "%s", \ table[i].name); \ break; \ } \ } \ len += sprintf(buf + len, "\n"); \ return len; \ } static struct { u32 value; char *name; } sas_device_type_names[] = { { SAS_PHY_UNUSED, "unused" }, { SAS_END_DEVICE, "end device" }, { SAS_EDGE_EXPANDER_DEVICE, "edge expander" }, { SAS_FANOUT_EXPANDER_DEVICE, "fanout expander" }, }; sas_bitfield_name_search(device_type, sas_device_type_names) static struct { u32 value; char *name; } sas_protocol_names[] = { { SAS_PROTOCOL_SATA, "sata" }, { SAS_PROTOCOL_SMP, "smp" }, { SAS_PROTOCOL_STP, "stp" }, { SAS_PROTOCOL_SSP, "ssp" }, }; sas_bitfield_name_match(protocol, sas_protocol_names) static struct { u32 value; char *name; } sas_linkspeed_names[] = { { SAS_LINK_RATE_UNKNOWN, "Unknown" }, { SAS_PHY_DISABLED, "Phy disabled" }, { SAS_LINK_RATE_FAILED, "Link Rate failed" }, { SAS_SATA_SPINUP_HOLD, "Spin-up hold" }, { SAS_LINK_RATE_1_5_GBPS, "1.5 Gbit" }, { SAS_LINK_RATE_3_0_GBPS, "3.0 Gbit" }, { SAS_LINK_RATE_6_0_GBPS, "6.0 Gbit" }, { SAS_LINK_RATE_12_0_GBPS, "12.0 Gbit" }, { SAS_LINK_RATE_22_5_GBPS, "22.5 Gbit" }, }; sas_bitfield_name_search(linkspeed, sas_linkspeed_names) sas_bitfield_name_set(linkspeed, sas_linkspeed_names) static struct sas_end_device *sas_sdev_to_rdev(struct scsi_device *sdev) { struct sas_rphy *rphy = target_to_rphy(sdev->sdev_target); struct sas_end_device *rdev; BUG_ON(rphy->identify.device_type != SAS_END_DEVICE); rdev = rphy_to_end_device(rphy); return rdev; } static int sas_smp_dispatch(struct bsg_job *job) { struct Scsi_Host *shost = dev_to_shost(job->dev); struct sas_rphy *rphy = NULL; if (!scsi_is_host_device(job->dev)) rphy = dev_to_rphy(job->dev); if (!job->reply_payload.payload_len) { dev_warn(job->dev, "space for a smp response is missing\n"); bsg_job_done(job, -EINVAL, 0); return 0; } to_sas_internal(shost->transportt)->f->smp_handler(job, shost, rphy); return 0; } static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) { struct request_queue *q; if (!to_sas_internal(shost->transportt)->f->smp_handler) { printk("%s can't handle SMP requests\n", shost->hostt->name); return 0; } if (rphy) { q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), NULL, sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); rphy->q = q; } else { char name[20]; snprintf(name, sizeof(name), "sas_host%d", shost->host_no); q = bsg_setup_queue(&shost->shost_gendev, name, NULL, sas_smp_dispatch, NULL, 0); if (IS_ERR(q)) return PTR_ERR(q); to_sas_host_attrs(shost)->q = q; } return 0; } /* * SAS host attributes */ static int sas_host_setup(struct transport_container *tc, struct device *dev, struct device *cdev) { struct Scsi_Host *shost = dev_to_shost(dev); struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); struct device *dma_dev = shost->dma_dev; INIT_LIST_HEAD(&sas_host->rphy_list); mutex_init(&sas_host->lock); sas_host->next_target_id = 0; sas_host->next_expander_id = 0; sas_host->next_port_id = 0; if (sas_bsg_initialize(shost, NULL)) dev_printk(KERN_ERR, dev, "fail to a bsg device %d\n", shost->host_no); if (dma_dev->dma_mask) { shost->opt_sectors = min_t(unsigned int, shost->max_sectors, dma_opt_mapping_size(dma_dev) >> SECTOR_SHIFT); } return 0; } static int sas_host_remove(struct transport_container *tc, struct device *dev, struct device *cdev) { struct Scsi_Host *shost = dev_to_shost(dev); struct request_queue *q = to_sas_host_attrs(shost)->q; bsg_remove_queue(q); return 0; } static DECLARE_TRANSPORT_CLASS(sas_host_class, "sas_host", sas_host_setup, sas_host_remove, NULL); static int sas_host_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_host_device(dev)) return 0; shost = dev_to_shost(dev); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->t.host_attrs.ac == cont; } static int do_sas_phy_delete(struct device *dev, void *data) { int pass = (int)(unsigned long)data; if (pass == 0 && scsi_is_sas_port(dev)) sas_port_delete(dev_to_sas_port(dev)); else if (pass == 1 && scsi_is_sas_phy(dev)) sas_phy_delete(dev_to_phy(dev)); return 0; } /** * sas_remove_children - tear down a devices SAS data structures * @dev: device belonging to the sas object * * Removes all SAS PHYs and remote PHYs for a given object */ void sas_remove_children(struct device *dev) { device_for_each_child(dev, (void *)0, do_sas_phy_delete); device_for_each_child(dev, (void *)1, do_sas_phy_delete); } EXPORT_SYMBOL(sas_remove_children); /** * sas_remove_host - tear down a Scsi_Host's SAS data structures * @shost: Scsi Host that is torn down * * Removes all SAS PHYs and remote PHYs for a given Scsi_Host and remove the * Scsi_Host as well. * * Note: Do not call scsi_remove_host() on the Scsi_Host any more, as it is * already removed. */ void sas_remove_host(struct Scsi_Host *shost) { sas_remove_children(&shost->shost_gendev); scsi_remove_host(shost); } EXPORT_SYMBOL(sas_remove_host); /** * sas_get_address - return the SAS address of the device * @sdev: scsi device * * Returns the SAS address of the scsi device */ u64 sas_get_address(struct scsi_device *sdev) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); return rdev->rphy.identify.sas_address; } EXPORT_SYMBOL(sas_get_address); /** * sas_tlr_supported - checking TLR bit in vpd 0x90 * @sdev: scsi device struct * * Check Transport Layer Retries are supported or not. * If vpd page 0x90 is present, TRL is supported. * */ unsigned int sas_tlr_supported(struct scsi_device *sdev) { const int vpd_len = 32; struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); char *buffer = kzalloc(vpd_len, GFP_KERNEL); int ret = 0; if (!buffer) goto out; if (scsi_get_vpd_page(sdev, 0x90, buffer, vpd_len)) goto out; /* * Magic numbers: the VPD Protocol page (0x90) * has a 4 byte header and then one entry per device port * the TLR bit is at offset 8 on each port entry * if we take the first port, that's at total offset 12 */ ret = buffer[12] & 0x01; out: kfree(buffer); rdev->tlr_supported = ret; return ret; } EXPORT_SYMBOL_GPL(sas_tlr_supported); /** * sas_disable_tlr - setting TLR flags * @sdev: scsi device struct * * Seting tlr_enabled flag to 0. * */ void sas_disable_tlr(struct scsi_device *sdev) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); rdev->tlr_enabled = 0; } EXPORT_SYMBOL_GPL(sas_disable_tlr); /** * sas_enable_tlr - setting TLR flags * @sdev: scsi device struct * * Seting tlr_enabled flag 1. * */ void sas_enable_tlr(struct scsi_device *sdev) { unsigned int tlr_supported = 0; tlr_supported = sas_tlr_supported(sdev); if (tlr_supported) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); rdev->tlr_enabled = 1; } return; } EXPORT_SYMBOL_GPL(sas_enable_tlr); unsigned int sas_is_tlr_enabled(struct scsi_device *sdev) { struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); return rdev->tlr_enabled; } EXPORT_SYMBOL_GPL(sas_is_tlr_enabled); /** * sas_ata_ncq_prio_supported - Check for ATA NCQ command priority support * @sdev: SCSI device * * Check if an ATA device supports NCQ priority using VPD page 89h (ATA * Information). Since this VPD page is implemented only for ATA devices, * this function always returns false for SCSI devices. */ bool sas_ata_ncq_prio_supported(struct scsi_device *sdev) { struct scsi_vpd *vpd; bool ncq_prio_supported = false; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg89); if (vpd && vpd->len >= 214) ncq_prio_supported = (vpd->data[213] >> 4) & 1; rcu_read_unlock(); return ncq_prio_supported; } EXPORT_SYMBOL_GPL(sas_ata_ncq_prio_supported); /* * SAS Phy attributes */ #define sas_phy_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_phy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ \ return snprintf(buf, 20, format_string, cast phy->field); \ } #define sas_phy_simple_attr(field, name, format_string, type) \ sas_phy_show_simple(field, name, format_string, (type)) \ static DEVICE_ATTR(name, S_IRUGO, show_sas_phy_##name, NULL) #define sas_phy_show_protocol(field, name) \ static ssize_t \ show_sas_phy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ \ if (!phy->field) \ return snprintf(buf, 20, "none\n"); \ return get_sas_protocol_names(phy->field, buf); \ } #define sas_phy_protocol_attr(field, name) \ sas_phy_show_protocol(field, name) \ static DEVICE_ATTR(name, S_IRUGO, show_sas_phy_##name, NULL) #define sas_phy_show_linkspeed(field) \ static ssize_t \ show_sas_phy_##field(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ \ return get_sas_linkspeed_names(phy->field, buf); \ } /* Fudge to tell if we're minimum or maximum */ #define sas_phy_store_linkspeed(field) \ static ssize_t \ store_sas_phy_##field(struct device *dev, \ struct device_attribute *attr, \ const char *buf, size_t count) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); \ struct sas_internal *i = to_sas_internal(shost->transportt); \ u32 value; \ struct sas_phy_linkrates rates = {0}; \ int error; \ \ error = set_sas_linkspeed_names(&value, buf); \ if (error) \ return error; \ rates.field = value; \ error = i->f->set_phy_speed(phy, &rates); \ \ return error ? error : count; \ } #define sas_phy_linkspeed_rw_attr(field) \ sas_phy_show_linkspeed(field) \ sas_phy_store_linkspeed(field) \ static DEVICE_ATTR(field, S_IRUGO, show_sas_phy_##field, \ store_sas_phy_##field) #define sas_phy_linkspeed_attr(field) \ sas_phy_show_linkspeed(field) \ static DEVICE_ATTR(field, S_IRUGO, show_sas_phy_##field, NULL) #define sas_phy_show_linkerror(field) \ static ssize_t \ show_sas_phy_##field(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_phy *phy = transport_class_to_phy(dev); \ struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); \ struct sas_internal *i = to_sas_internal(shost->transportt); \ int error; \ \ error = i->f->get_linkerrors ? i->f->get_linkerrors(phy) : 0; \ if (error) \ return error; \ return snprintf(buf, 20, "%u\n", phy->field); \ } #define sas_phy_linkerror_attr(field) \ sas_phy_show_linkerror(field) \ static DEVICE_ATTR(field, S_IRUGO, show_sas_phy_##field, NULL) static ssize_t show_sas_device_type(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_phy *phy = transport_class_to_phy(dev); if (!phy->identify.device_type) return snprintf(buf, 20, "none\n"); return get_sas_device_type_names(phy->identify.device_type, buf); } static DEVICE_ATTR(device_type, S_IRUGO, show_sas_device_type, NULL); static ssize_t do_sas_phy_enable(struct device *dev, size_t count, int enable) { struct sas_phy *phy = transport_class_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); int error; error = i->f->phy_enable(phy, enable); if (error) return error; phy->enabled = enable; return count; }; static ssize_t store_sas_phy_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { if (count < 1) return -EINVAL; switch (buf[0]) { case '0': do_sas_phy_enable(dev, count, 0); break; case '1': do_sas_phy_enable(dev, count, 1); break; default: return -EINVAL; } return count; } static ssize_t show_sas_phy_enable(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_phy *phy = transport_class_to_phy(dev); return snprintf(buf, 20, "%d\n", phy->enabled); } static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, show_sas_phy_enable, store_sas_phy_enable); static ssize_t do_sas_phy_reset(struct device *dev, size_t count, int hard_reset) { struct sas_phy *phy = transport_class_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); int error; error = i->f->phy_reset(phy, hard_reset); if (error) return error; phy->enabled = 1; return count; }; static ssize_t store_sas_link_reset(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return do_sas_phy_reset(dev, count, 0); } static DEVICE_ATTR(link_reset, S_IWUSR, NULL, store_sas_link_reset); static ssize_t store_sas_hard_reset(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { return do_sas_phy_reset(dev, count, 1); } static DEVICE_ATTR(hard_reset, S_IWUSR, NULL, store_sas_hard_reset); sas_phy_protocol_attr(identify.initiator_port_protocols, initiator_port_protocols); sas_phy_protocol_attr(identify.target_port_protocols, target_port_protocols); sas_phy_simple_attr(identify.sas_address, sas_address, "0x%016llx\n", unsigned long long); sas_phy_simple_attr(identify.phy_identifier, phy_identifier, "%d\n", u8); sas_phy_linkspeed_attr(negotiated_linkrate); sas_phy_linkspeed_attr(minimum_linkrate_hw); sas_phy_linkspeed_rw_attr(minimum_linkrate); sas_phy_linkspeed_attr(maximum_linkrate_hw); sas_phy_linkspeed_rw_attr(maximum_linkrate); sas_phy_linkerror_attr(invalid_dword_count); sas_phy_linkerror_attr(running_disparity_error_count); sas_phy_linkerror_attr(loss_of_dword_sync_count); sas_phy_linkerror_attr(phy_reset_problem_count); static int sas_phy_setup(struct transport_container *tc, struct device *dev, struct device *cdev) { struct sas_phy *phy = dev_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); if (i->f->phy_setup) i->f->phy_setup(phy); return 0; } static DECLARE_TRANSPORT_CLASS(sas_phy_class, "sas_phy", sas_phy_setup, NULL, NULL); static int sas_phy_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_sas_phy(dev)) return 0; shost = dev_to_shost(dev->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->phy_attr_cont.ac == cont; } static void sas_phy_release(struct device *dev) { struct sas_phy *phy = dev_to_phy(dev); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); if (i->f->phy_release) i->f->phy_release(phy); put_device(dev->parent); kfree(phy); } /** * sas_phy_alloc - allocates and initialize a SAS PHY structure * @parent: Parent device * @number: Phy index * * Allocates an SAS PHY structure. It will be added in the device tree * below the device specified by @parent, which has to be either a Scsi_Host * or sas_rphy. * * Returns: * SAS PHY allocated or %NULL if the allocation failed. */ struct sas_phy *sas_phy_alloc(struct device *parent, int number) { struct Scsi_Host *shost = dev_to_shost(parent); struct sas_phy *phy; phy = kzalloc(sizeof(*phy), GFP_KERNEL); if (!phy) return NULL; phy->number = number; phy->enabled = 1; device_initialize(&phy->dev); phy->dev.parent = get_device(parent); phy->dev.release = sas_phy_release; INIT_LIST_HEAD(&phy->port_siblings); if (scsi_is_sas_expander_device(parent)) { struct sas_rphy *rphy = dev_to_rphy(parent); dev_set_name(&phy->dev, "phy-%d:%d:%d", shost->host_no, rphy->scsi_target_id, number); } else dev_set_name(&phy->dev, "phy-%d:%d", shost->host_no, number); transport_setup_device(&phy->dev); return phy; } EXPORT_SYMBOL(sas_phy_alloc); /** * sas_phy_add - add a SAS PHY to the device hierarchy * @phy: The PHY to be added * * Publishes a SAS PHY to the rest of the system. */ int sas_phy_add(struct sas_phy *phy) { int error; error = device_add(&phy->dev); if (error) return error; error = transport_add_device(&phy->dev); if (error) { device_del(&phy->dev); return error; } transport_configure_device(&phy->dev); return 0; } EXPORT_SYMBOL(sas_phy_add); /** * sas_phy_free - free a SAS PHY * @phy: SAS PHY to free * * Frees the specified SAS PHY. * * Note: * This function must only be called on a PHY that has not * successfully been added using sas_phy_add(). */ void sas_phy_free(struct sas_phy *phy) { transport_destroy_device(&phy->dev); put_device(&phy->dev); } EXPORT_SYMBOL(sas_phy_free); /** * sas_phy_delete - remove SAS PHY * @phy: SAS PHY to remove * * Removes the specified SAS PHY. If the SAS PHY has an * associated remote PHY it is removed before. */ void sas_phy_delete(struct sas_phy *phy) { struct device *dev = &phy->dev; /* this happens if the phy is still part of a port when deleted */ BUG_ON(!list_empty(&phy->port_siblings)); transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); put_device(dev); } EXPORT_SYMBOL(sas_phy_delete); /** * scsi_is_sas_phy - check if a struct device represents a SAS PHY * @dev: device to check * * Returns: * %1 if the device represents a SAS PHY, %0 else */ int scsi_is_sas_phy(const struct device *dev) { return dev->release == sas_phy_release; } EXPORT_SYMBOL(scsi_is_sas_phy); /* * SAS Port attributes */ #define sas_port_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_port_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_port *port = transport_class_to_sas_port(dev); \ \ return snprintf(buf, 20, format_string, cast port->field); \ } #define sas_port_simple_attr(field, name, format_string, type) \ sas_port_show_simple(field, name, format_string, (type)) \ static DEVICE_ATTR(name, S_IRUGO, show_sas_port_##name, NULL) sas_port_simple_attr(num_phys, num_phys, "%d\n", int); static DECLARE_TRANSPORT_CLASS(sas_port_class, "sas_port", NULL, NULL, NULL); static int sas_port_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_sas_port(dev)) return 0; shost = dev_to_shost(dev->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->port_attr_cont.ac == cont; } static void sas_port_release(struct device *dev) { struct sas_port *port = dev_to_sas_port(dev); BUG_ON(!list_empty(&port->phy_list)); put_device(dev->parent); kfree(port); } static void sas_port_create_link(struct sas_port *port, struct sas_phy *phy) { int res; res = sysfs_create_link(&port->dev.kobj, &phy->dev.kobj, dev_name(&phy->dev)); if (res) goto err; res = sysfs_create_link(&phy->dev.kobj, &port->dev.kobj, "port"); if (res) goto err; return; err: printk(KERN_ERR "%s: Cannot create port links, err=%d\n", __func__, res); } static void sas_port_delete_link(struct sas_port *port, struct sas_phy *phy) { sysfs_remove_link(&port->dev.kobj, dev_name(&phy->dev)); sysfs_remove_link(&phy->dev.kobj, "port"); } /** sas_port_alloc - allocate and initialize a SAS port structure * * @parent: parent device * @port_id: port number * * Allocates a SAS port structure. It will be added to the device tree * below the device specified by @parent which must be either a Scsi_Host * or a sas_expander_device. * * Returns %NULL on error */ struct sas_port *sas_port_alloc(struct device *parent, int port_id) { struct Scsi_Host *shost = dev_to_shost(parent); struct sas_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return NULL; port->port_identifier = port_id; device_initialize(&port->dev); port->dev.parent = get_device(parent); port->dev.release = sas_port_release; mutex_init(&port->phy_list_mutex); INIT_LIST_HEAD(&port->phy_list); if (scsi_is_sas_expander_device(parent)) { struct sas_rphy *rphy = dev_to_rphy(parent); dev_set_name(&port->dev, "port-%d:%d:%d", shost->host_no, rphy->scsi_target_id, port->port_identifier); } else dev_set_name(&port->dev, "port-%d:%d", shost->host_no, port->port_identifier); transport_setup_device(&port->dev); return port; } EXPORT_SYMBOL(sas_port_alloc); /** sas_port_alloc_num - allocate and initialize a SAS port structure * * @parent: parent device * * Allocates a SAS port structure and a number to go with it. This * interface is really for adapters where the port number has no * meansing, so the sas class should manage them. It will be added to * the device tree below the device specified by @parent which must be * either a Scsi_Host or a sas_expander_device. * * Returns %NULL on error */ struct sas_port *sas_port_alloc_num(struct device *parent) { int index; struct Scsi_Host *shost = dev_to_shost(parent); struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); /* FIXME: use idr for this eventually */ mutex_lock(&sas_host->lock); if (scsi_is_sas_expander_device(parent)) { struct sas_rphy *rphy = dev_to_rphy(parent); struct sas_expander_device *exp = rphy_to_expander_device(rphy); index = exp->next_port_id++; } else index = sas_host->next_port_id++; mutex_unlock(&sas_host->lock); return sas_port_alloc(parent, index); } EXPORT_SYMBOL(sas_port_alloc_num); /** * sas_port_add - add a SAS port to the device hierarchy * @port: port to be added * * publishes a port to the rest of the system */ int sas_port_add(struct sas_port *port) { int error; /* No phys should be added until this is made visible */ BUG_ON(!list_empty(&port->phy_list)); error = device_add(&port->dev); if (error) return error; transport_add_device(&port->dev); transport_configure_device(&port->dev); return 0; } EXPORT_SYMBOL(sas_port_add); /** * sas_port_free - free a SAS PORT * @port: SAS PORT to free * * Frees the specified SAS PORT. * * Note: * This function must only be called on a PORT that has not * successfully been added using sas_port_add(). */ void sas_port_free(struct sas_port *port) { transport_destroy_device(&port->dev); put_device(&port->dev); } EXPORT_SYMBOL(sas_port_free); /** * sas_port_delete - remove SAS PORT * @port: SAS PORT to remove * * Removes the specified SAS PORT. If the SAS PORT has an * associated phys, unlink them from the port as well. */ void sas_port_delete(struct sas_port *port) { struct device *dev = &port->dev; struct sas_phy *phy, *tmp_phy; if (port->rphy) { sas_rphy_delete(port->rphy); port->rphy = NULL; } mutex_lock(&port->phy_list_mutex); list_for_each_entry_safe(phy, tmp_phy, &port->phy_list, port_siblings) { sas_port_delete_link(port, phy); list_del_init(&phy->port_siblings); } mutex_unlock(&port->phy_list_mutex); if (port->is_backlink) { struct device *parent = port->dev.parent; sysfs_remove_link(&port->dev.kobj, dev_name(parent)); port->is_backlink = 0; } transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); put_device(dev); } EXPORT_SYMBOL(sas_port_delete); /** * scsi_is_sas_port - check if a struct device represents a SAS port * @dev: device to check * * Returns: * %1 if the device represents a SAS Port, %0 else */ int scsi_is_sas_port(const struct device *dev) { return dev->release == sas_port_release; } EXPORT_SYMBOL(scsi_is_sas_port); /** * sas_port_get_phy - try to take a reference on a port member * @port: port to check */ struct sas_phy *sas_port_get_phy(struct sas_port *port) { struct sas_phy *phy; mutex_lock(&port->phy_list_mutex); if (list_empty(&port->phy_list)) phy = NULL; else { struct list_head *ent = port->phy_list.next; phy = list_entry(ent, typeof(*phy), port_siblings); get_device(&phy->dev); } mutex_unlock(&port->phy_list_mutex); return phy; } EXPORT_SYMBOL(sas_port_get_phy); /** * sas_port_add_phy - add another phy to a port to form a wide port * @port: port to add the phy to * @phy: phy to add * * When a port is initially created, it is empty (has no phys). All * ports must have at least one phy to operated, and all wide ports * must have at least two. The current code makes no difference * between ports and wide ports, but the only object that can be * connected to a remote device is a port, so ports must be formed on * all devices with phys if they're connected to anything. */ void sas_port_add_phy(struct sas_port *port, struct sas_phy *phy) { mutex_lock(&port->phy_list_mutex); if (unlikely(!list_empty(&phy->port_siblings))) { /* make sure we're already on this port */ struct sas_phy *tmp; list_for_each_entry(tmp, &port->phy_list, port_siblings) if (tmp == phy) break; /* If this trips, you added a phy that was already * part of a different port */ if (unlikely(tmp != phy)) { dev_printk(KERN_ERR, &port->dev, "trying to add phy %s fails: it's already part of another port\n", dev_name(&phy->dev)); BUG(); } } else { sas_port_create_link(port, phy); list_add_tail(&phy->port_siblings, &port->phy_list); port->num_phys++; } mutex_unlock(&port->phy_list_mutex); } EXPORT_SYMBOL(sas_port_add_phy); /** * sas_port_delete_phy - remove a phy from a port or wide port * @port: port to remove the phy from * @phy: phy to remove * * This operation is used for tearing down ports again. It must be * done to every port or wide port before calling sas_port_delete. */ void sas_port_delete_phy(struct sas_port *port, struct sas_phy *phy) { mutex_lock(&port->phy_list_mutex); sas_port_delete_link(port, phy); list_del_init(&phy->port_siblings); port->num_phys--; mutex_unlock(&port->phy_list_mutex); } EXPORT_SYMBOL(sas_port_delete_phy); void sas_port_mark_backlink(struct sas_port *port) { int res; struct device *parent = port->dev.parent->parent->parent; if (port->is_backlink) return; port->is_backlink = 1; res = sysfs_create_link(&port->dev.kobj, &parent->kobj, dev_name(parent)); if (res) goto err; return; err: printk(KERN_ERR "%s: Cannot create port backlink, err=%d\n", __func__, res); } EXPORT_SYMBOL(sas_port_mark_backlink); /* * SAS remote PHY attributes. */ #define sas_rphy_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_rphy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ \ return snprintf(buf, 20, format_string, cast rphy->field); \ } #define sas_rphy_simple_attr(field, name, format_string, type) \ sas_rphy_show_simple(field, name, format_string, (type)) \ static SAS_DEVICE_ATTR(rphy, name, S_IRUGO, \ show_sas_rphy_##name, NULL) #define sas_rphy_show_protocol(field, name) \ static ssize_t \ show_sas_rphy_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ \ if (!rphy->field) \ return snprintf(buf, 20, "none\n"); \ return get_sas_protocol_names(rphy->field, buf); \ } #define sas_rphy_protocol_attr(field, name) \ sas_rphy_show_protocol(field, name) \ static SAS_DEVICE_ATTR(rphy, name, S_IRUGO, \ show_sas_rphy_##name, NULL) static ssize_t show_sas_rphy_device_type(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_rphy *rphy = transport_class_to_rphy(dev); if (!rphy->identify.device_type) return snprintf(buf, 20, "none\n"); return get_sas_device_type_names( rphy->identify.device_type, buf); } static SAS_DEVICE_ATTR(rphy, device_type, S_IRUGO, show_sas_rphy_device_type, NULL); static ssize_t show_sas_rphy_enclosure_identifier(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_rphy *rphy = transport_class_to_rphy(dev); struct sas_phy *phy = dev_to_phy(rphy->dev.parent); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); u64 identifier; int error; error = i->f->get_enclosure_identifier(rphy, &identifier); if (error) return error; return sprintf(buf, "0x%llx\n", (unsigned long long)identifier); } static SAS_DEVICE_ATTR(rphy, enclosure_identifier, S_IRUGO, show_sas_rphy_enclosure_identifier, NULL); static ssize_t show_sas_rphy_bay_identifier(struct device *dev, struct device_attribute *attr, char *buf) { struct sas_rphy *rphy = transport_class_to_rphy(dev); struct sas_phy *phy = dev_to_phy(rphy->dev.parent); struct Scsi_Host *shost = dev_to_shost(phy->dev.parent); struct sas_internal *i = to_sas_internal(shost->transportt); int val; val = i->f->get_bay_identifier(rphy); if (val < 0) return val; return sprintf(buf, "%d\n", val); } static SAS_DEVICE_ATTR(rphy, bay_identifier, S_IRUGO, show_sas_rphy_bay_identifier, NULL); sas_rphy_protocol_attr(identify.initiator_port_protocols, initiator_port_protocols); sas_rphy_protocol_attr(identify.target_port_protocols, target_port_protocols); sas_rphy_simple_attr(identify.sas_address, sas_address, "0x%016llx\n", unsigned long long); sas_rphy_simple_attr(identify.phy_identifier, phy_identifier, "%d\n", u8); sas_rphy_simple_attr(scsi_target_id, scsi_target_id, "%d\n", u32); /* only need 8 bytes of data plus header (4 or 8) */ #define BUF_SIZE 64 int sas_read_port_mode_page(struct scsi_device *sdev) { char *buffer = kzalloc(BUF_SIZE, GFP_KERNEL), *msdata; struct sas_end_device *rdev = sas_sdev_to_rdev(sdev); struct scsi_mode_data mode_data; int error; if (!buffer) return -ENOMEM; error = scsi_mode_sense(sdev, 1, 0x19, 0, buffer, BUF_SIZE, 30*HZ, 3, &mode_data, NULL); if (error) goto out; msdata = buffer + mode_data.header_length + mode_data.block_descriptor_length; if (msdata - buffer > BUF_SIZE - 8) goto out; error = 0; rdev->ready_led_meaning = msdata[2] & 0x10 ? 1 : 0; rdev->I_T_nexus_loss_timeout = (msdata[4] << 8) + msdata[5]; rdev->initiator_response_timeout = (msdata[6] << 8) + msdata[7]; out: kfree(buffer); return error; } EXPORT_SYMBOL(sas_read_port_mode_page); static DECLARE_TRANSPORT_CLASS(sas_end_dev_class, "sas_end_device", NULL, NULL, NULL); #define sas_end_dev_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_end_dev_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ struct sas_end_device *rdev = rphy_to_end_device(rphy); \ \ return snprintf(buf, 20, format_string, cast rdev->field); \ } #define sas_end_dev_simple_attr(field, name, format_string, type) \ sas_end_dev_show_simple(field, name, format_string, (type)) \ static SAS_DEVICE_ATTR(end_dev, name, S_IRUGO, \ show_sas_end_dev_##name, NULL) sas_end_dev_simple_attr(ready_led_meaning, ready_led_meaning, "%d\n", int); sas_end_dev_simple_attr(I_T_nexus_loss_timeout, I_T_nexus_loss_timeout, "%d\n", int); sas_end_dev_simple_attr(initiator_response_timeout, initiator_response_timeout, "%d\n", int); sas_end_dev_simple_attr(tlr_supported, tlr_supported, "%d\n", int); sas_end_dev_simple_attr(tlr_enabled, tlr_enabled, "%d\n", int); static DECLARE_TRANSPORT_CLASS(sas_expander_class, "sas_expander", NULL, NULL, NULL); #define sas_expander_show_simple(field, name, format_string, cast) \ static ssize_t \ show_sas_expander_##name(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct sas_rphy *rphy = transport_class_to_rphy(dev); \ struct sas_expander_device *edev = rphy_to_expander_device(rphy); \ \ return snprintf(buf, 20, format_string, cast edev->field); \ } #define sas_expander_simple_attr(field, name, format_string, type) \ sas_expander_show_simple(field, name, format_string, (type)) \ static SAS_DEVICE_ATTR(expander, name, S_IRUGO, \ show_sas_expander_##name, NULL) sas_expander_simple_attr(vendor_id, vendor_id, "%s\n", char *); sas_expander_simple_attr(product_id, product_id, "%s\n", char *); sas_expander_simple_attr(product_rev, product_rev, "%s\n", char *); sas_expander_simple_attr(component_vendor_id, component_vendor_id, "%s\n", char *); sas_expander_simple_attr(component_id, component_id, "%u\n", unsigned int); sas_expander_simple_attr(component_revision_id, component_revision_id, "%u\n", unsigned int); sas_expander_simple_attr(level, level, "%d\n", int); static DECLARE_TRANSPORT_CLASS(sas_rphy_class, "sas_device", NULL, NULL, NULL); static int sas_rphy_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; if (!scsi_is_sas_rphy(dev)) return 0; shost = dev_to_shost(dev->parent->parent); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->rphy_attr_cont.ac == cont; } static int sas_end_dev_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; struct sas_rphy *rphy; if (!scsi_is_sas_rphy(dev)) return 0; shost = dev_to_shost(dev->parent->parent); rphy = dev_to_rphy(dev); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->end_dev_attr_cont.ac == cont && rphy->identify.device_type == SAS_END_DEVICE; } static int sas_expander_match(struct attribute_container *cont, struct device *dev) { struct Scsi_Host *shost; struct sas_internal *i; struct sas_rphy *rphy; if (!scsi_is_sas_rphy(dev)) return 0; shost = dev_to_shost(dev->parent->parent); rphy = dev_to_rphy(dev); if (!shost->transportt) return 0; if (shost->transportt->host_attrs.ac.class != &sas_host_class.class) return 0; i = to_sas_internal(shost->transportt); return &i->expander_attr_cont.ac == cont && (rphy->identify.device_type == SAS_EDGE_EXPANDER_DEVICE || rphy->identify.device_type == SAS_FANOUT_EXPANDER_DEVICE); } static void sas_expander_release(struct device *dev) { struct sas_rphy *rphy = dev_to_rphy(dev); struct sas_expander_device *edev = rphy_to_expander_device(rphy); put_device(dev->parent); kfree(edev); } static void sas_end_device_release(struct device *dev) { struct sas_rphy *rphy = dev_to_rphy(dev); struct sas_end_device *edev = rphy_to_end_device(rphy); put_device(dev->parent); kfree(edev); } /** * sas_rphy_initialize - common rphy initialization * @rphy: rphy to initialise * * Used by both sas_end_device_alloc() and sas_expander_alloc() to * initialise the common rphy component of each. */ static void sas_rphy_initialize(struct sas_rphy *rphy) { INIT_LIST_HEAD(&rphy->list); } /** * sas_end_device_alloc - allocate an rphy for an end device * @parent: which port * * Allocates an SAS remote PHY structure, connected to @parent. * * Returns: * SAS PHY allocated or %NULL if the allocation failed. */ struct sas_rphy *sas_end_device_alloc(struct sas_port *parent) { struct Scsi_Host *shost = dev_to_shost(&parent->dev); struct sas_end_device *rdev; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) { return NULL; } device_initialize(&rdev->rphy.dev); rdev->rphy.dev.parent = get_device(&parent->dev); rdev->rphy.dev.release = sas_end_device_release; if (scsi_is_sas_expander_device(parent->dev.parent)) { struct sas_rphy *rphy = dev_to_rphy(parent->dev.parent); dev_set_name(&rdev->rphy.dev, "end_device-%d:%d:%d", shost->host_no, rphy->scsi_target_id, parent->port_identifier); } else dev_set_name(&rdev->rphy.dev, "end_device-%d:%d", shost->host_no, parent->port_identifier); rdev->rphy.identify.device_type = SAS_END_DEVICE; sas_rphy_initialize(&rdev->rphy); transport_setup_device(&rdev->rphy.dev); return &rdev->rphy; } EXPORT_SYMBOL(sas_end_device_alloc); /** * sas_expander_alloc - allocate an rphy for an end device * @parent: which port * @type: SAS_EDGE_EXPANDER_DEVICE or SAS_FANOUT_EXPANDER_DEVICE * * Allocates an SAS remote PHY structure, connected to @parent. * * Returns: * SAS PHY allocated or %NULL if the allocation failed. */ struct sas_rphy *sas_expander_alloc(struct sas_port *parent, enum sas_device_type type) { struct Scsi_Host *shost = dev_to_shost(&parent->dev); struct sas_expander_device *rdev; struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); BUG_ON(type != SAS_EDGE_EXPANDER_DEVICE && type != SAS_FANOUT_EXPANDER_DEVICE); rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); if (!rdev) { return NULL; } device_initialize(&rdev->rphy.dev); rdev->rphy.dev.parent = get_device(&parent->dev); rdev->rphy.dev.release = sas_expander_release; mutex_lock(&sas_host->lock); rdev->rphy.scsi_target_id = sas_host->next_expander_id++; mutex_unlock(&sas_host->lock); dev_set_name(&rdev->rphy.dev, "expander-%d:%d", shost->host_no, rdev->rphy.scsi_target_id); rdev->rphy.identify.device_type = type; sas_rphy_initialize(&rdev->rphy); transport_setup_device(&rdev->rphy.dev); return &rdev->rphy; } EXPORT_SYMBOL(sas_expander_alloc); /** * sas_rphy_add - add a SAS remote PHY to the device hierarchy * @rphy: The remote PHY to be added * * Publishes a SAS remote PHY to the rest of the system. */ int sas_rphy_add(struct sas_rphy *rphy) { struct sas_port *parent = dev_to_sas_port(rphy->dev.parent); struct Scsi_Host *shost = dev_to_shost(parent->dev.parent); struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); struct sas_identify *identify = &rphy->identify; int error; if (parent->rphy) return -ENXIO; parent->rphy = rphy; error = device_add(&rphy->dev); if (error) return error; transport_add_device(&rphy->dev); transport_configure_device(&rphy->dev); if (sas_bsg_initialize(shost, rphy)) printk("fail to a bsg device %s\n", dev_name(&rphy->dev)); mutex_lock(&sas_host->lock); list_add_tail(&rphy->list, &sas_host->rphy_list); if (identify->device_type == SAS_END_DEVICE && (identify->target_port_protocols & (SAS_PROTOCOL_SSP | SAS_PROTOCOL_STP | SAS_PROTOCOL_SATA))) rphy->scsi_target_id = sas_host->next_target_id++; else if (identify->device_type == SAS_END_DEVICE) rphy->scsi_target_id = -1; mutex_unlock(&sas_host->lock); if (identify->device_type == SAS_END_DEVICE && rphy->scsi_target_id != -1) { int lun; if (identify->target_port_protocols & SAS_PROTOCOL_SSP) lun = SCAN_WILD_CARD; else lun = 0; scsi_scan_target(&rphy->dev, 0, rphy->scsi_target_id, lun, SCSI_SCAN_INITIAL); } return 0; } EXPORT_SYMBOL(sas_rphy_add); /** * sas_rphy_free - free a SAS remote PHY * @rphy: SAS remote PHY to free * * Frees the specified SAS remote PHY. * * Note: * This function must only be called on a remote * PHY that has not successfully been added using * sas_rphy_add() (or has been sas_rphy_remove()'d) */ void sas_rphy_free(struct sas_rphy *rphy) { struct device *dev = &rphy->dev; struct Scsi_Host *shost = dev_to_shost(rphy->dev.parent->parent); struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); mutex_lock(&sas_host->lock); list_del(&rphy->list); mutex_unlock(&sas_host->lock); transport_destroy_device(dev); put_device(dev); } EXPORT_SYMBOL(sas_rphy_free); /** * sas_rphy_delete - remove and free SAS remote PHY * @rphy: SAS remote PHY to remove and free * * Removes the specified SAS remote PHY and frees it. */ void sas_rphy_delete(struct sas_rphy *rphy) { sas_rphy_remove(rphy); sas_rphy_free(rphy); } EXPORT_SYMBOL(sas_rphy_delete); /** * sas_rphy_unlink - unlink SAS remote PHY * @rphy: SAS remote phy to unlink from its parent port * * Removes port reference to an rphy */ void sas_rphy_unlink(struct sas_rphy *rphy) { struct sas_port *parent = dev_to_sas_port(rphy->dev.parent); parent->rphy = NULL; } EXPORT_SYMBOL(sas_rphy_unlink); /** * sas_rphy_remove - remove SAS remote PHY * @rphy: SAS remote phy to remove * * Removes the specified SAS remote PHY. */ void sas_rphy_remove(struct sas_rphy *rphy) { struct device *dev = &rphy->dev; switch (rphy->identify.device_type) { case SAS_END_DEVICE: scsi_remove_target(dev); break; case SAS_EDGE_EXPANDER_DEVICE: case SAS_FANOUT_EXPANDER_DEVICE: sas_remove_children(dev); break; default: break; } sas_rphy_unlink(rphy); bsg_remove_queue(rphy->q); transport_remove_device(dev); device_del(dev); } EXPORT_SYMBOL(sas_rphy_remove); /** * scsi_is_sas_rphy - check if a struct device represents a SAS remote PHY * @dev: device to check * * Returns: * %1 if the device represents a SAS remote PHY, %0 else */ int scsi_is_sas_rphy(const struct device *dev) { return dev->release == sas_end_device_release || dev->release == sas_expander_release; } EXPORT_SYMBOL(scsi_is_sas_rphy); /* * SCSI scan helper */ static int sas_user_scan(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct sas_host_attrs *sas_host = to_sas_host_attrs(shost); struct sas_rphy *rphy; mutex_lock(&sas_host->lock); list_for_each_entry(rphy, &sas_host->rphy_list, list) { if (rphy->identify.device_type != SAS_END_DEVICE || rphy->scsi_target_id == -1) continue; if ((channel == SCAN_WILD_CARD || channel == 0) && (id == SCAN_WILD_CARD || id == rphy->scsi_target_id)) { scsi_scan_target(&rphy->dev, 0, rphy->scsi_target_id, lun, SCSI_SCAN_MANUAL); } } mutex_unlock(&sas_host->lock); return 0; } /* * Setup / Teardown code */ #define SETUP_TEMPLATE(attrb, field, perm, test) \ i->private_##attrb[count] = dev_attr_##field; \ i->private_##attrb[count].attr.mode = perm; \ i->attrb[count] = &i->private_##attrb[count]; \ if (test) \ count++ #define SETUP_TEMPLATE_RW(attrb, field, perm, test, ro_test, ro_perm) \ i->private_##attrb[count] = dev_attr_##field; \ i->private_##attrb[count].attr.mode = perm; \ if (ro_test) { \ i->private_##attrb[count].attr.mode = ro_perm; \ i->private_##attrb[count].store = NULL; \ } \ i->attrb[count] = &i->private_##attrb[count]; \ if (test) \ count++ #define SETUP_RPORT_ATTRIBUTE(field) \ SETUP_TEMPLATE(rphy_attrs, field, S_IRUGO, 1) #define SETUP_OPTIONAL_RPORT_ATTRIBUTE(field, func) \ SETUP_TEMPLATE(rphy_attrs, field, S_IRUGO, i->f->func) #define SETUP_PHY_ATTRIBUTE(field) \ SETUP_TEMPLATE(phy_attrs, field, S_IRUGO, 1) #define SETUP_PHY_ATTRIBUTE_RW(field) \ SETUP_TEMPLATE_RW(phy_attrs, field, S_IRUGO | S_IWUSR, 1, \ !i->f->set_phy_speed, S_IRUGO) #define SETUP_OPTIONAL_PHY_ATTRIBUTE_RW(field, func) \ SETUP_TEMPLATE_RW(phy_attrs, field, S_IRUGO | S_IWUSR, 1, \ !i->f->func, S_IRUGO) #define SETUP_PORT_ATTRIBUTE(field) \ SETUP_TEMPLATE(port_attrs, field, S_IRUGO, 1) #define SETUP_OPTIONAL_PHY_ATTRIBUTE(field, func) \ SETUP_TEMPLATE(phy_attrs, field, S_IRUGO, i->f->func) #define SETUP_PHY_ATTRIBUTE_WRONLY(field) \ SETUP_TEMPLATE(phy_attrs, field, S_IWUSR, 1) #define SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(field, func) \ SETUP_TEMPLATE(phy_attrs, field, S_IWUSR, i->f->func) #define SETUP_END_DEV_ATTRIBUTE(field) \ SETUP_TEMPLATE(end_dev_attrs, field, S_IRUGO, 1) #define SETUP_EXPANDER_ATTRIBUTE(field) \ SETUP_TEMPLATE(expander_attrs, expander_##field, S_IRUGO, 1) /** * sas_attach_transport - instantiate SAS transport template * @ft: SAS transport class function template */ struct scsi_transport_template * sas_attach_transport(struct sas_function_template *ft) { struct sas_internal *i; int count; i = kzalloc(sizeof(struct sas_internal), GFP_KERNEL); if (!i) return NULL; i->t.user_scan = sas_user_scan; i->t.host_attrs.ac.attrs = &i->host_attrs[0]; i->t.host_attrs.ac.class = &sas_host_class.class; i->t.host_attrs.ac.match = sas_host_match; transport_container_register(&i->t.host_attrs); i->t.host_size = sizeof(struct sas_host_attrs); i->phy_attr_cont.ac.class = &sas_phy_class.class; i->phy_attr_cont.ac.attrs = &i->phy_attrs[0]; i->phy_attr_cont.ac.match = sas_phy_match; transport_container_register(&i->phy_attr_cont); i->port_attr_cont.ac.class = &sas_port_class.class; i->port_attr_cont.ac.attrs = &i->port_attrs[0]; i->port_attr_cont.ac.match = sas_port_match; transport_container_register(&i->port_attr_cont); i->rphy_attr_cont.ac.class = &sas_rphy_class.class; i->rphy_attr_cont.ac.attrs = &i->rphy_attrs[0]; i->rphy_attr_cont.ac.match = sas_rphy_match; transport_container_register(&i->rphy_attr_cont); i->end_dev_attr_cont.ac.class = &sas_end_dev_class.class; i->end_dev_attr_cont.ac.attrs = &i->end_dev_attrs[0]; i->end_dev_attr_cont.ac.match = sas_end_dev_match; transport_container_register(&i->end_dev_attr_cont); i->expander_attr_cont.ac.class = &sas_expander_class.class; i->expander_attr_cont.ac.attrs = &i->expander_attrs[0]; i->expander_attr_cont.ac.match = sas_expander_match; transport_container_register(&i->expander_attr_cont); i->f = ft; count = 0; SETUP_PHY_ATTRIBUTE(initiator_port_protocols); SETUP_PHY_ATTRIBUTE(target_port_protocols); SETUP_PHY_ATTRIBUTE(device_type); SETUP_PHY_ATTRIBUTE(sas_address); SETUP_PHY_ATTRIBUTE(phy_identifier); SETUP_PHY_ATTRIBUTE(negotiated_linkrate); SETUP_PHY_ATTRIBUTE(minimum_linkrate_hw); SETUP_PHY_ATTRIBUTE_RW(minimum_linkrate); SETUP_PHY_ATTRIBUTE(maximum_linkrate_hw); SETUP_PHY_ATTRIBUTE_RW(maximum_linkrate); SETUP_PHY_ATTRIBUTE(invalid_dword_count); SETUP_PHY_ATTRIBUTE(running_disparity_error_count); SETUP_PHY_ATTRIBUTE(loss_of_dword_sync_count); SETUP_PHY_ATTRIBUTE(phy_reset_problem_count); SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(link_reset, phy_reset); SETUP_OPTIONAL_PHY_ATTRIBUTE_WRONLY(hard_reset, phy_reset); SETUP_OPTIONAL_PHY_ATTRIBUTE_RW(enable, phy_enable); i->phy_attrs[count] = NULL; count = 0; SETUP_PORT_ATTRIBUTE(num_phys); i->port_attrs[count] = NULL; count = 0; SETUP_RPORT_ATTRIBUTE(rphy_initiator_port_protocols); SETUP_RPORT_ATTRIBUTE(rphy_target_port_protocols); SETUP_RPORT_ATTRIBUTE(rphy_device_type); SETUP_RPORT_ATTRIBUTE(rphy_sas_address); SETUP_RPORT_ATTRIBUTE(rphy_phy_identifier); SETUP_RPORT_ATTRIBUTE(rphy_scsi_target_id); SETUP_OPTIONAL_RPORT_ATTRIBUTE(rphy_enclosure_identifier, get_enclosure_identifier); SETUP_OPTIONAL_RPORT_ATTRIBUTE(rphy_bay_identifier, get_bay_identifier); i->rphy_attrs[count] = NULL; count = 0; SETUP_END_DEV_ATTRIBUTE(end_dev_ready_led_meaning); SETUP_END_DEV_ATTRIBUTE(end_dev_I_T_nexus_loss_timeout); SETUP_END_DEV_ATTRIBUTE(end_dev_initiator_response_timeout); SETUP_END_DEV_ATTRIBUTE(end_dev_tlr_supported); SETUP_END_DEV_ATTRIBUTE(end_dev_tlr_enabled); i->end_dev_attrs[count] = NULL; count = 0; SETUP_EXPANDER_ATTRIBUTE(vendor_id); SETUP_EXPANDER_ATTRIBUTE(product_id); SETUP_EXPANDER_ATTRIBUTE(product_rev); SETUP_EXPANDER_ATTRIBUTE(component_vendor_id); SETUP_EXPANDER_ATTRIBUTE(component_id); SETUP_EXPANDER_ATTRIBUTE(component_revision_id); SETUP_EXPANDER_ATTRIBUTE(level); i->expander_attrs[count] = NULL; return &i->t; } EXPORT_SYMBOL(sas_attach_transport); /** * sas_release_transport - release SAS transport template instance * @t: transport template instance */ void sas_release_transport(struct scsi_transport_template *t) { struct sas_internal *i = to_sas_internal(t); transport_container_unregister(&i->t.host_attrs); transport_container_unregister(&i->phy_attr_cont); transport_container_unregister(&i->port_attr_cont); transport_container_unregister(&i->rphy_attr_cont); transport_container_unregister(&i->end_dev_attr_cont); transport_container_unregister(&i->expander_attr_cont); kfree(i); } EXPORT_SYMBOL(sas_release_transport); static __init int sas_transport_init(void) { int error; error = transport_class_register(&sas_host_class); if (error) goto out; error = transport_class_register(&sas_phy_class); if (error) goto out_unregister_transport; error = transport_class_register(&sas_port_class); if (error) goto out_unregister_phy; error = transport_class_register(&sas_rphy_class); if (error) goto out_unregister_port; error = transport_class_register(&sas_end_dev_class); if (error) goto out_unregister_rphy; error = transport_class_register(&sas_expander_class); if (error) goto out_unregister_end_dev; return 0; out_unregister_end_dev: transport_class_unregister(&sas_end_dev_class); out_unregister_rphy: transport_class_unregister(&sas_rphy_class); out_unregister_port: transport_class_unregister(&sas_port_class); out_unregister_phy: transport_class_unregister(&sas_phy_class); out_unregister_transport: transport_class_unregister(&sas_host_class); out: return error; } static void __exit sas_transport_exit(void) { transport_class_unregister(&sas_host_class); transport_class_unregister(&sas_phy_class); transport_class_unregister(&sas_port_class); transport_class_unregister(&sas_rphy_class); transport_class_unregister(&sas_end_dev_class); transport_class_unregister(&sas_expander_class); } MODULE_AUTHOR("Christoph Hellwig"); MODULE_DESCRIPTION("SAS Transport Attributes"); MODULE_LICENSE("GPL"); module_init(sas_transport_init); module_exit(sas_transport_exit);
2001 2080 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 // SPDX-License-Identifier: GPL-2.0 /* * All the USB notify logic * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * notifier functions originally based on those in kernel/sys.c * but fixed up to not be so broken. * * Released under the GPLv2 only. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/usb.h> #include <linux/mutex.h> #include "usb.h" static BLOCKING_NOTIFIER_HEAD(usb_notifier_list); /** * usb_register_notify - register a notifier callback whenever a usb change happens * @nb: pointer to the notifier block for the callback events. * * These changes are either USB devices or busses being added or removed. */ void usb_register_notify(struct notifier_block *nb) { blocking_notifier_chain_register(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_register_notify); /** * usb_unregister_notify - unregister a notifier callback * @nb: pointer to the notifier block for the callback events. * * usb_register_notify() must have been previously called for this function * to work properly. */ void usb_unregister_notify(struct notifier_block *nb) { blocking_notifier_chain_unregister(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_unregister_notify); void usb_notify_add_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev); } void usb_notify_remove_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev); } void usb_notify_add_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus); } void usb_notify_remove_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus); }
5 5 4 3 1 2 1 5 4 2 2 1 1 4 6 1 5 2 3 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project */ /* * Authors: * Noriaki TAKAMIYA @USAGI * Masahide NAKAMURA @USAGI */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/time.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <net/mip6.h> static inline unsigned int calc_padlen(unsigned int len, unsigned int n) { return (n - len + 16) & 0x7; } static inline void *mip6_padn(__u8 *data, __u8 padlen) { if (!data) return NULL; if (padlen == 1) { data[0] = IPV6_TLV_PAD1; } else if (padlen > 1) { data[0] = IPV6_TLV_PADN; data[1] = padlen - 2; if (padlen > 2) memset(data+2, 0, data[1]); } return data + padlen; } static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); } static int mip6_mh_len(int type) { int len = 0; switch (type) { case IP6_MH_TYPE_BRR: len = 0; break; case IP6_MH_TYPE_HOTI: case IP6_MH_TYPE_COTI: case IP6_MH_TYPE_BU: case IP6_MH_TYPE_BACK: len = 1; break; case IP6_MH_TYPE_HOT: case IP6_MH_TYPE_COT: case IP6_MH_TYPE_BERROR: len = 2; break; } return len; } static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb) { struct ip6_mh _hdr; const struct ip6_mh *mh; mh = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_hdr), &_hdr); if (!mh) return -1; if (((mh->ip6mh_hdrlen + 1) << 3) > skb->len) return -1; if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) { net_dbg_ratelimited("mip6: MH message too short: %d vs >=%d\n", mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type)); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_hdrlen) + skb_network_header_len(skb)); return -1; } if (mh->ip6mh_proto != IPPROTO_NONE) { net_dbg_ratelimited("mip6: MH invalid payload proto = %d\n", mh->ip6mh_proto); mip6_param_prob(skb, 0, offsetof(struct ip6_mh, ip6mh_proto) + skb_network_header_len(skb)); return -1; } return 0; } struct mip6_report_rate_limiter { spinlock_t lock; ktime_t stamp; int iif; struct in6_addr src; struct in6_addr dst; }; static struct mip6_report_rate_limiter mip6_report_rl = { .lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock) }; static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data; int err = destopt->nexthdr; spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); return err; } /* Destination Option Header is inserted. * IP Header's src address is replaced with Home Address Option in * Destination Option Header. */ static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph; struct ipv6_destopt_hdr *dstopt; struct ipv6_destopt_hao *hao; u8 nexthdr; int len; skb_push(skb, -skb_network_offset(skb)); iph = ipv6_hdr(skb); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_DSTOPTS; dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb); dstopt->nexthdr = nexthdr; hao = mip6_padn((char *)(dstopt + 1), calc_padlen(sizeof(*dstopt), 6)); hao->type = IPV6_TLV_HAO; BUILD_BUG_ON(sizeof(*hao) != 18); hao->length = sizeof(*hao) - 2; len = ((char *)hao - (char *)dstopt) + sizeof(*hao); memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr)); spin_lock_bh(&x->lock); memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr)); spin_unlock_bh(&x->lock); WARN_ON(len != x->props.header_len); dstopt->hdrlen = (x->props.header_len >> 3) - 1; return 0; } static inline int mip6_report_rl_allow(ktime_t stamp, const struct in6_addr *dst, const struct in6_addr *src, int iif) { int allow = 0; spin_lock_bh(&mip6_report_rl.lock); if (mip6_report_rl.stamp != stamp || mip6_report_rl.iif != iif || !ipv6_addr_equal(&mip6_report_rl.src, src) || !ipv6_addr_equal(&mip6_report_rl.dst, dst)) { mip6_report_rl.stamp = stamp; mip6_report_rl.iif = iif; mip6_report_rl.src = *src; mip6_report_rl.dst = *dst; allow = 1; } spin_unlock_bh(&mip6_report_rl.lock); return allow; } static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, const struct flowi *fl) { struct net *net = xs_net(x); struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb; const struct flowi6 *fl6 = &fl->u.ip6; struct ipv6_destopt_hao *hao = NULL; struct xfrm_selector sel; int offset; ktime_t stamp; int err = 0; if (unlikely(fl6->flowi6_proto == IPPROTO_MH && fl6->fl6_mh_type <= IP6_MH_TYPE_MAX)) goto out; if (likely(opt->dsthao)) { offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(offset >= 0)) hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + offset); } stamp = skb_get_ktime(skb); if (!mip6_report_rl_allow(stamp, &ipv6_hdr(skb)->daddr, hao ? &hao->addr : &ipv6_hdr(skb)->saddr, opt->iif)) goto out; memset(&sel, 0, sizeof(sel)); memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr, sizeof(sel.daddr)); sel.prefixlen_d = 128; memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, sizeof(sel.saddr)); sel.prefixlen_s = 128; sel.family = AF_INET6; sel.proto = fl6->flowi6_proto; sel.dport = xfrm_flowi_dport(fl, &fl6->uli); if (sel.dport) sel.dport_mask = htons(~0); sel.sport = xfrm_flowi_sport(fl, &fl6->uli); if (sel.sport) sel.sport_mask = htons(~0); sel.ifindex = fl6->flowi6_oif; err = km_report(net, IPPROTO_DSTOPTS, &sel, (hao ? (xfrm_address_t *)&hao->addr : NULL)); out: return err; } static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6_destopt_hdr) + calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) + sizeof(struct ipv6_destopt_hao); WARN_ON(x->props.header_len != 24); return 0; } /* * Do nothing about destroying since it has no specific operation for * destination options header unlike IPsec protocols. */ static void mip6_destopt_destroy(struct xfrm_state *x) { } static const struct xfrm_type mip6_destopt_type = { .owner = THIS_MODULE, .proto = IPPROTO_DSTOPTS, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR, .init_state = mip6_destopt_init_state, .destructor = mip6_destopt_destroy, .input = mip6_destopt_input, .output = mip6_destopt_output, .reject = mip6_destopt_reject, }; static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data; int err = rt2->rt_hdr.nexthdr; spin_lock(&x->lock); if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); return err; } /* Routing Header type 2 is inserted. * IP Header's dst address is replaced with Routing Header's Home Address. */ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *iph; struct rt2_hdr *rt2; u8 nexthdr; skb_push(skb, -skb_network_offset(skb)); iph = ipv6_hdr(skb); nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_ROUTING; rt2 = (struct rt2_hdr *)skb_transport_header(skb); rt2->rt_hdr.nexthdr = nexthdr; rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1; rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2; rt2->rt_hdr.segments_left = 1; memset(&rt2->reserved, 0, sizeof(rt2->reserved)); WARN_ON(rt2->rt_hdr.hdrlen != 2); memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr)); spin_lock_bh(&x->lock); memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr)); spin_unlock_bh(&x->lock); return 0; } static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } x->props.header_len = sizeof(struct rt2_hdr); return 0; } /* * Do nothing about destroying since it has no specific operation for routing * header type 2 unlike IPsec protocols. */ static void mip6_rthdr_destroy(struct xfrm_state *x) { } static const struct xfrm_type mip6_rthdr_type = { .owner = THIS_MODULE, .proto = IPPROTO_ROUTING, .flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR, .init_state = mip6_rthdr_init_state, .destructor = mip6_rthdr_destroy, .input = mip6_rthdr_input, .output = mip6_rthdr_output, }; static int __init mip6_init(void) { pr_info("Mobile IPv6\n"); if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type(destopt)\n", __func__); goto mip6_destopt_xfrm_fail; } if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type(rthdr)\n", __func__); goto mip6_rthdr_xfrm_fail; } if (rawv6_mh_filter_register(mip6_mh_filter) < 0) { pr_info("%s: can't add rawv6 mh filter\n", __func__); goto mip6_rawv6_mh_fail; } return 0; mip6_rawv6_mh_fail: xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); mip6_rthdr_xfrm_fail: xfrm_unregister_type(&mip6_destopt_type, AF_INET6); mip6_destopt_xfrm_fail: return -EAGAIN; } static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); module_exit(mip6_fini); MODULE_DESCRIPTION("IPv6 Mobility driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING);
2 2 2 4 9 10 10 7 7 7 7 7 9 2 8 6 6 6 6 5 5 4 4 5 10 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * dir.c */ /* * This file implements code to read directories from disk. * * See namei.c for a description of directory organisation on disk. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" static const unsigned char squashfs_filetype_table[] = { DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK }; /* * Lookup offset (f_pos) in the directory index, returning the * metadata block containing it. * * If we get an error reading the index then return the part of the index * (if any) we have managed to read - the index isn't essential, just * quicker. */ static int get_dir_index_using_offset(struct super_block *sb, u64 *next_block, int *next_offset, u64 index_start, int index_offset, int i_count, u64 f_pos) { struct squashfs_sb_info *msblk = sb->s_fs_info; int err, i, index, length = 0; unsigned int size; struct squashfs_dir_index dir_index; TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", i_count, f_pos); /* * Translate from external f_pos to the internal f_pos. This * is offset by 3 because we invent "." and ".." entries which are * not actually stored in the directory. */ if (f_pos <= 3) return f_pos; f_pos -= 3; for (i = 0; i < i_count; i++) { err = squashfs_read_metadata(sb, &dir_index, &index_start, &index_offset, sizeof(dir_index)); if (err < 0) break; index = le32_to_cpu(dir_index.index); if (index > f_pos) /* * Found the index we're looking for. */ break; size = le32_to_cpu(dir_index.size) + 1; /* size should never be larger than SQUASHFS_NAME_LEN */ if (size > SQUASHFS_NAME_LEN) break; err = squashfs_read_metadata(sb, NULL, &index_start, &index_offset, size); if (err < 0) break; length = index; *next_block = le32_to_cpu(dir_index.start_block) + msblk->directory_table; } *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; /* * Translate back from internal f_pos to external f_pos. */ return length + 3; } static int squashfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; u64 block = squashfs_i(inode)->start + msblk->directory_table; int offset = squashfs_i(inode)->offset, length, err; unsigned int inode_number, dir_count, size, type; struct squashfs_dir_header dirh; struct squashfs_dir_entry *dire; TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); if (dire == NULL) { ERROR("Failed to allocate squashfs_dir_entry\n"); goto finish; } /* * Return "." and ".." entries as the first two filenames in the * directory. To maximise compression these two entries are not * stored in the directory, and so we invent them here. * * It also means that the external f_pos is offset by 3 from the * on-disk directory f_pos. */ while (ctx->pos < 3) { char *name; int i_ino; if (ctx->pos == 0) { name = "."; size = 1; i_ino = inode->i_ino; } else { name = ".."; size = 2; i_ino = squashfs_i(inode)->parent; } if (!dir_emit(ctx, name, size, i_ino, squashfs_filetype_table[1])) goto finish; ctx->pos += size; } length = get_dir_index_using_offset(inode->i_sb, &block, &offset, squashfs_i(inode)->dir_idx_start, squashfs_i(inode)->dir_idx_offset, squashfs_i(inode)->dir_idx_cnt, ctx->pos); while (length < i_size_read(inode)) { /* * Read directory header */ err = squashfs_read_metadata(inode->i_sb, &dirh, &block, &offset, sizeof(dirh)); if (err < 0) goto failed_read; length += sizeof(dirh); dir_count = le32_to_cpu(dirh.count) + 1; if (dir_count > SQUASHFS_DIR_COUNT) goto failed_read; while (dir_count--) { /* * Read directory entry. */ err = squashfs_read_metadata(inode->i_sb, dire, &block, &offset, sizeof(*dire)); if (err < 0) goto failed_read; size = le16_to_cpu(dire->size) + 1; /* size should never be larger than SQUASHFS_NAME_LEN */ if (size > SQUASHFS_NAME_LEN) goto failed_read; err = squashfs_read_metadata(inode->i_sb, dire->name, &block, &offset, size); if (err < 0) goto failed_read; length += sizeof(*dire) + size; if (ctx->pos >= length) continue; dire->name[size] = '\0'; inode_number = le32_to_cpu(dirh.inode_number) + ((short) le16_to_cpu(dire->inode_number)); type = le16_to_cpu(dire->type); if (type > SQUASHFS_MAX_DIR_TYPE) goto failed_read; if (!dir_emit(ctx, dire->name, size, inode_number, squashfs_filetype_table[type])) goto finish; ctx->pos = length; } } finish: kfree(dire); return 0; failed_read: ERROR("Unable to read directory block [%llx:%x]\n", block, offset); kfree(dire); return 0; } const struct file_operations squashfs_dir_ops = { .read = generic_read_dir, .iterate_shared = squashfs_readdir, .llseek = generic_file_llseek, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H #include <linux/const.h> #include <asm/alternative.h> #include <asm/ibt.h> /* * Constructor for a conventional segment GDT (or LDT) entry. * This is a macro so it can be used in initializers. */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 #define GDT_ENTRY_BOOT_DS 3 #define GDT_ENTRY_BOOT_TSS 4 #define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) #define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) /* * Bottom two bits of selector give the ring * privilege level */ #define SEGMENT_RPL_MASK 0x3 /* * When running on Xen PV, the actual privilege level of the kernel is 1, * not 0. Testing the Requested Privilege Level in a segment selector to * determine whether the context is user mode or kernel mode with * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level * matches the 0x3 mask. * * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV * kernels because privilege level 2 is never used. */ #define USER_SEGMENT_RPL_MASK 0x2 /* User mode is privilege level 3: */ #define USER_RPL 0x3 /* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ #define SEGMENT_TI_MASK 0x4 /* LDT segment has TI set ... */ #define SEGMENT_LDT 0x4 /* ... GDT has it cleared */ #define SEGMENT_GDT 0x0 #define GDT_ENTRY_INVALID_SEG 0 #if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64) /* * The layout of the per-CPU GDT under Linux: * * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] * 28 - VDSO getcpu * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ #define GDT_ENTRY_TLS_MIN 6 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) #define GDT_ENTRY_KERNEL_CS 12 #define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 #define GDT_ENTRY_DEFAULT_USER_DS 15 #define GDT_ENTRY_TSS 16 #define GDT_ENTRY_LDT 17 #define GDT_ENTRY_PNPBIOS_CS32 18 #define GDT_ENTRY_PNPBIOS_CS16 19 #define GDT_ENTRY_PNPBIOS_DS 20 #define GDT_ENTRY_PNPBIOS_TS1 21 #define GDT_ENTRY_PNPBIOS_TS2 22 #define GDT_ENTRY_APMBIOS_BASE 23 #define GDT_ENTRY_ESPFIX_SS 26 #define GDT_ENTRY_PERCPU 27 #define GDT_ENTRY_CPUNODE 28 #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 32 /* * Segment selector values corresponding to the above entries: */ #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) /* code segment for BIOS: */ #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) /* data segment for BIOS: */ #define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) /* transfer data segment: */ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) /* another data segment: */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) #ifdef CONFIG_SMP # define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else # define __KERNEL_PERCPU 0 #endif #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #else /* 64-bit: */ #include <asm/cache.h> #define GDT_ENTRY_KERNEL32_CS 1 #define GDT_ENTRY_KERNEL_CS 2 #define GDT_ENTRY_KERNEL_DS 3 /* * We cannot use the same code segment descriptor for user and kernel mode, * not even in long flat mode, because of different DPL. * * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes * selectors: * * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, * * ss = STAR.SYSRET_CS+8 (in either case) * * thus USER_DS should be between 32-bit and 64-bit code selectors: */ #define GDT_ENTRY_DEFAULT_USER32_CS 4 #define GDT_ENTRY_DEFAULT_USER_DS 5 #define GDT_ENTRY_DEFAULT_USER_CS 6 /* Needs two entries */ #define GDT_ENTRY_TSS 8 /* Needs two entries */ #define GDT_ENTRY_LDT 10 #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 16 /* * Segment selector values corresponding to the above entries: * * Note, selectors also need to have a correct RPL, * expressed with the +3 value for user-space selectors: */ #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack: */ #define EXCEPTION_ERRCODE_MASK 0x20027d00 #define GDT_SIZE (GDT_ENTRIES*8) #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) /* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ #define VDSO_CPUNODE_BITS 12 #define VDSO_CPUNODE_MASK 0xfff #ifndef __ASSEMBLY__ /* Helper functions to store/load CPU and node numbers */ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) { return (node << VDSO_CPUNODE_BITS) | cpu; } static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { unsigned int p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP * and works on all CPUs. This is volatile so that it orders * correctly with respect to barrier() and to keep GCC from cleverly * hoisting it out of the calling function. * * If RDPID is available, use it. */ alternative_io ("lsl %[seg],%[p]", ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ X86_FEATURE_RDPID, [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); if (node) *node = (p >> VDSO_CPUNODE_BITS); } #endif /* !__ASSEMBLY__ */ #ifdef __KERNEL__ /* * early_idt_handler_array is an array of entry points referenced in the * early IDT. For simplicity, it's a real array with one entry point * every nine bytes. That leaves room for an optional 'push $0' if the * vector has no error code (two bytes), a 'push $vector_number' (two * bytes), and a jump to the common entry code (up to five bytes). */ #define EARLY_IDT_HANDLER_SIZE (9 + ENDBR_INSN_SIZE) /* * xen_early_idt_handler_array is for Xen pv guests: for each entry in * early_idt_handler_array it contains a prequel in the form of * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to * max 8 bytes. */ #define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE) #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); #ifdef CONFIG_XEN_PV extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; #endif /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any * failure to fully clear the cached descriptor is only observable for * FS and GS. */ #define __loadsegment_simple(seg, value) \ do { \ unsigned short __val = (value); \ \ asm volatile(" \n" \ "1: movl %k0,%%" #seg " \n" \ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\ : "+r" (__val) : : "memory"); \ } while (0) #define __loadsegment_ss(value) __loadsegment_simple(ss, (value)) #define __loadsegment_ds(value) __loadsegment_simple(ds, (value)) #define __loadsegment_es(value) __loadsegment_simple(es, (value)) #ifdef CONFIG_X86_32 /* * On 32-bit systems, the hidden parts of FS and GS are unobservable if * the selector is NULL, so there's no funny business here. */ #define __loadsegment_fs(value) __loadsegment_simple(fs, (value)) #define __loadsegment_gs(value) __loadsegment_simple(gs, (value)) #else static inline void __loadsegment_fs(unsigned short value) { asm volatile(" \n" "1: movw %0, %%fs \n" "2: \n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } /* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */ #endif #define loadsegment(seg, value) __loadsegment_ ## seg (value) /* * Save a segment register away: */ #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */
371 176 573 14 27 11 12 27 774 775 90 1 676 6 148 14 2 176 53 40 2 163 13 41 757 24 758 10 216 7 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_inode_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* Crypto operations for filesystems */ struct fscrypt_operations { /* * If set, then fs/crypto/ will allocate a global bounce page pool the * first time an encryption key is set up for a file. The bounce page * pool is required by the following functions: * * - fscrypt_encrypt_pagecache_blocks() * - fscrypt_zeroout_range() for files not using inline crypto * * If the filesystem doesn't use those, it doesn't need to set this. */ unsigned int needs_bounce_pages : 1; /* * If set, then fs/crypto/ will allow the use of encryption settings * that assume inode numbers fit in 32 bits (i.e. * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other * prerequisites for these settings are also met. This is only useful * if the filesystem wants to support inline encryption hardware that is * limited to 32-bit or 64-bit data unit numbers and where programming * keyslots is very slow. */ unsigned int has_32bit_inodes : 1; /* * If set, then fs/crypto/ will allow users to select a crypto data unit * size that is less than the filesystem block size. This is done via * the log2_data_unit_size field of the fscrypt policy. This flag is * not compatible with filesystems that encrypt variable-length blocks * (i.e. blocks that aren't all equal to filesystem's block size), for * example as a result of compression. It's also not compatible with * the fscrypt_encrypt_block_inplace() and * fscrypt_decrypt_block_inplace() functions. */ unsigned int supports_subblock_data_units : 1; /* * This field exists only for backwards compatibility reasons and should * only be set by the filesystems that are setting it already. It * contains the filesystem-specific key description prefix that is * accepted for "logon" keys for v1 fscrypt policies. This * functionality is deprecated in favor of the generic prefix * "fscrypt:", which itself is deprecated in favor of the filesystem * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY. Filesystems that * are newly adding fscrypt support should not set this field. */ const char *legacy_key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); }; int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_crypt_info); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its * plaintext alias as a result of the encryption key being added, * DCACHE_NOKEY_NAME must be cleared and there might be an opportunity * to disable d_revalidate. Note that we don't have to support the * inverse operation because fscrypt doesn't allow no-key names to be * the source or target of a rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { /* * VFS calls fscrypt_handle_d_move even for non-fscrypt * filesystems. */ if (dentry->d_flags & DCACHE_NOKEY_NAME) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; /* * Other filesystem features might be handling dentry * revalidation, in which case it cannot be disabled. */ if (dentry->d_op->d_revalidate == fscrypt_d_revalidate) dentry->d_flags &= ~DCACHE_OP_REVALIDATE; } } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { /* * This code tries to only take ->d_lock when necessary to write * to ->d_flags. We shouldn't be peeking on d_flags for * DCACHE_OP_REVALIDATE unlocked, but in the unlikely case * there is a race, the worst it can happen is that we fail to * unset DCACHE_OP_REVALIDATE and pay the cost of an extra * d_revalidate. */ if (is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } else if (dentry->d_flags & DCACHE_OP_REVALIDATE && dentry->d_op->d_revalidate == fscrypt_d_revalidate) { /* * Unencrypted dentries and encrypted dentries where the * key is available are always valid from fscrypt * perspective. Avoid the cost of calling * fscrypt_d_revalidate unnecessarily. */ spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_OP_REVALIDATE; spin_unlock(&dentry->d_lock); } } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return folio->mapping == NULL; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { return bounce_folio->private; } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return false; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } static inline void fscrypt_set_bio_crypt_ctx_bh( struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { return true; } static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. */ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_inode_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. */ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; fscrypt_prepare_dentry(dentry, false); return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H #include <linux/ceph/ceph_debug.h> #include <linux/ceph/osd_client.h> #include <linux/unaligned.h> #include <linux/backing-dev.h> #include <linux/completion.h> #include <linux/exportfs.h> #include <linux/fs.h> #include <linux/mempool.h> #include <linux/pagemap.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/posix_acl.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/netfs.h> #include <linux/fscache.h> #include <linux/hashtable.h> #include <linux/ceph/libceph.h> #include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ CEPH_MOUNT_OPT_NOCOPYFROM | \ CEPH_MOUNT_OPT_ASYNC_DIROPS) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt #define ceph_clear_mount_opt(fsc, opt) \ (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) /* max size of osd read request, limited by libceph */ #define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN /* osd has a configurable limitaion of max write size. * CEPH_MSG_MAX_DATA_LEN should be small enough. */ #define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" /* * Delay telling the MDS we no longer want caps, in case we reopen * the file. Delay a minimum amount of time, even if we send a cap * message for some other reason. Otherwise, take the oppotunity to * update the mds to avoid sending another message later. */ #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ struct ceph_mount_options { unsigned int flags; unsigned int wsize; /* max write size */ unsigned int rsize; /* max read size */ unsigned int rasize; /* max readahead */ unsigned int congestion_kb; /* max writeback in flight */ unsigned int caps_wanted_delay_min, caps_wanted_delay_max; int caps_max; unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ bool new_dev_syntax; /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() */ char *snapdir_name; /* default ".snap" */ char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ char *mon_addr; struct fscrypt_dummy_policy dummy_enc_policy; }; /* mount state */ enum { CEPH_MOUNT_MOUNTING, CEPH_MOUNT_MOUNTED, CEPH_MOUNT_UNMOUNTING, CEPH_MOUNT_UNMOUNTED, CEPH_MOUNT_SHUTDOWN, CEPH_MOUNT_RECOVER, CEPH_MOUNT_FENCE_IO, }; #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 struct ceph_fs_client { struct super_block *sb; struct list_head metric_wakeup; struct ceph_mount_options *mount_options; struct ceph_client *client; int mount_state; bool blocklisted; bool have_copy_from2; u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; atomic_long_t writeback_count; bool write_congested; struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); spinlock_t async_unlink_conflict_lock; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; struct dentry *debugfs_status; struct dentry *debugfs_mds_sessions; struct dentry *debugfs_metrics_dir; #endif #ifdef CONFIG_CEPH_FSCACHE struct fscache_volume *fscache; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_dummy_policy fsc_dummy_enc_policy; #endif }; /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read * and write data. For any given inode, we should have one or more * capabilities, one issued by each metadata server, and our * cumulative access is the OR of all issued capabilities. * * Each cap is referenced by the inode's i_caps rbtree and by per-mds * session capability lists. */ struct ceph_cap { struct ceph_inode_info *ci; struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ u64 cap_id; /* unique cap id (mds provided) */ union { /* in-use caps */ struct { int issued; /* latest, from the mds */ int implemented; /* implemented superset of issued (for revocation) */ int mds; /* mds index for this cap */ int mds_wanted; /* caps wanted from this mds */ }; /* caps to release */ struct { u64 cap_ino; int queue_release; }; }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; struct list_head caps_item; }; #define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ #define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ #define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ #define CHECK_CAPS_FLUSH_FORCE 8 /* force flush any caps */ struct ceph_cap_flush { u64 tid; int caps; bool wake; /* wake up flush waiters when finish ? */ bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode }; /* * Snapped cap state that is pending flush to mds. When a snapshot occurs, * we first complete any in-process sync writes and writeback any dirty * data before flushing the snapped state (tracked here) back to the MDS. */ struct ceph_cap_snap { refcount_t nref; struct list_head ci_item; struct ceph_cap_flush cap_flush; u64 follows; int issued, dirty; struct ceph_snap_context *context; umode_t mode; kuid_t uid; kgid_t gid; struct ceph_buffer *xattr_blob; u64 xattr_version; u64 size; u64 change_attr; struct timespec64 mtime, atime, ctime, btime; u64 time_warp_seq; u64 truncate_size; u32 truncate_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); kmem_cache_free(ceph_cap_snap_cachep, capsnap); } } /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where * metadata authority is delegated, and whether/where metadata is replicated. * * A _leaf_ frag will be present in the i_fragtree IFF there is * delegation info. That is, if mds >= 0 || ndist > 0. */ #define CEPH_MAX_DIRFRAG_REP 4 struct ceph_inode_frag { struct rb_node node; /* fragtree state */ u32 frag; int split_by; /* i.e. 2^(split_by) children */ /* delegation and replication info */ int mds; /* -1 if same authority as parent */ int ndist; /* >0 if replicated */ int dist[CEPH_MAX_DIRFRAG_REP]; }; /* * We cache inode xattrs as an encoded blob until they are first used, * at which point we parse them into an rbtree. */ struct ceph_inode_xattr { struct rb_node node; const char *name; int name_len; const char *val; int val_len; int dirty; int should_free_name; int should_free_val; }; /* * Ceph dentry state */ struct ceph_dentry_info { struct dentry *dentry; struct ceph_mds_session *lease_session; struct list_head lease_list; struct hlist_node hnode; unsigned long flags; int lease_shared_gen; u32 lease_gen; u32 lease_seq; unsigned long lease_renew_after, lease_renew_from; unsigned long time; u64 offset; }; #define CEPH_DENTRY_REFERENCED (1 << 0) #define CEPH_DENTRY_LEASE_LIST (1 << 1) #define CEPH_DENTRY_SHRINK_LIST (1 << 2) #define CEPH_DENTRY_PRIMARY_LINK (1 << 3) #define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) #define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) #define CEPH_DENTRY_ASYNC_CREATE_BIT (5) #define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT) struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing * this until someone actually calls getxattr, etc. * * blob->vec.iov_len == 4 implies there are no xattrs; blob == * NULL means we don't know. */ struct ceph_buffer *blob, *prealloc_blob; struct rb_root index; bool dirty; int count; int names_size; int vals_size; u64 version, index_version; }; /* * Ceph inode. */ struct ceph_inode_info { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct ceph_vino i_vino; /* ceph ino + snap */ spinlock_t i_ceph_lock; u64 i_version; u64 i_inline_version; u32 i_time_warp_seq; unsigned long i_ceph_flags; atomic64_t i_release_count; atomic64_t i_ordered_count; atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; struct ceph_file_layout i_cached_layout; // for async creates char *i_symlink; /* for dirs */ struct timespec64 i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; u64 i_files, i_subdirs; /* quotas */ u64 i_max_bytes, i_max_files; s32 i_dir_pin; struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; /* capabilities. protected _both_ by i_ceph_lock and cap->session's * s_mutex. */ struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ /* * Link to the auth cap's session's s_cap_dirty list. s_cap_dirty * is protected by the mdsc->cap_dirty_lock, but each individual item * is also protected by the inode's i_ceph_lock. Walking s_cap_dirty * requires the mdsc->cap_dirty_lock. List presence for an item can * be tested under the i_ceph_lock. Changing anything requires both. */ struct list_head i_dirty_item; /* * Link to session's s_cap_flushing list. Protected in a similar * fashion to i_dirty_item, but also by the s_mutex for changes. The * s_cap_flushing list can be walked while holding either the s_mutex * or msdc->cap_dirty_lock. List presence can also be checked while * holding the i_ceph_lock for this inode. */ struct list_head i_flushing_item; /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ struct ceph_cap_flush *i_prealloc_cap_flush; struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ unsigned long i_last_rd; unsigned long i_last_wr; int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ /* * For none fscrypt case it equals to i_truncate_size or it will * equals to fscrypt_file_size */ u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ u64 i_wanted_max_size; /* offset we'd like to write too */ u64 i_requested_max_size; /* max_size we've requested */ /* held references to caps */ int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; atomic_t i_filelock_ref; atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; union { struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ }; struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; struct timespec64 i_btime; struct timespec64 i_snap_btime; struct work_struct i_work; unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; u8 *fscrypt_file; #endif }; struct ceph_netfs_request_data { int caps; /* * Maximum size of a file readahead request. * The fadvise could update the bdi's default ra_pages. */ unsigned int file_ra_pages; /* Set it if fadvise disables file readahead entirely */ bool file_ra_disabled; }; static inline struct ceph_inode_info * ceph_inode(const struct inode *inode) { return container_of(inode, struct ceph_inode_info, netfs.inode); } static inline struct ceph_fs_client * ceph_inode_to_fs_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } static inline struct ceph_fs_client * ceph_sb_to_fs_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } static inline struct ceph_mds_client * ceph_sb_to_mdsc(const struct super_block *sb) { return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc; } static inline struct ceph_client * ceph_inode_to_client(const struct inode *inode) { return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client; } static inline struct ceph_vino ceph_vino(const struct inode *inode) { return ceph_inode(inode)->i_vino; } static inline u32 ceph_ino_to_ino32(u64 vino) { u32 ino = vino & 0xffffffff; ino ^= vino >> 32; if (!ino) ino = 2; return ino; } /* * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on * some arches. We generally do not use this value inside the ceph driver, but * we do want to set it to something, so that generic vfs code has an * appropriate value for tracepoints and the like. */ static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino) { if (sizeof(ino_t) == sizeof(u32)) return ceph_ino_to_ino32(vino.ino); return (ino_t)vino.ino; } /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap static inline u64 ceph_ino(struct inode *inode) { return ceph_inode(inode)->i_vino.ino; } static inline u64 ceph_snap(struct inode *inode) { return ceph_inode(inode)->i_vino.snap; } /** * ceph_present_ino - format an inode number for presentation to userland * @sb: superblock where the inode lives * @ino: inode number to (possibly) convert * * If the user mounted with the ino32 option, then the 64-bit value needs * to be converted to something that can fit inside 32 bits. Note that * internal kernel code never uses this value, so this is entirely for * userland consumption. */ static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) { if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32))) return ceph_ino_to_ino32(ino); return ino; } static inline u64 ceph_present_inode(struct inode *inode) { return ceph_present_ino(inode->i_sb, ceph_ino(inode)); } static inline int ceph_ino_compare(struct inode *inode, void *data) { struct ceph_vino *pvino = (struct ceph_vino *)data; struct ceph_inode_info *ci = ceph_inode(inode); return ci->i_vino.ino == pvino->ino && ci->i_vino.snap == pvino->snap; } /* * The MDS reserves a set of inodes for its own usage. These should never * be accessible by clients, and so the MDS has no reason to ever hand these * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. * * These come from src/mds/mdstypes.h in the ceph sources. */ #define CEPH_MAX_MDS 0x100 #define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) #define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { if (vino.ino >= CEPH_INO_SYSTEM_BASE || vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) return false; /* Don't warn on mdsdirs */ WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, "Attempt to access reserved inode number 0x%llx", vino.ino); return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { if (ceph_vino_is_reserved(vino)) return NULL; /* * NB: The hashval will be run through the fs/inode.c hash function * anyway, so there is no need to squash the inode number down to * 32-bits first. Just use low-order bits on arches with 32-bit long. */ return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino); } /* * Ceph inode. */ #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ #define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ #define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ #define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ #define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async creating finishes */ /* * Masks of ceph inode work. */ #define CEPH_I_WORK_WRITEBACK 0 #define CEPH_I_WORK_INVALIDATE_PAGES 1 #define CEPH_I_WORK_VMTRUNCATE 2 #define CEPH_I_WORK_CHECK_CAPS 3 #define CEPH_I_WORK_FLUSH_SNAPS 4 /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode * and then clear it when they start succeeding. Note that we do a lockless * check first, and only take the lock if it looks like it needs to be changed. * The write submission code just takes this as a hint, so we're not too * worried if a few slip through in either direction. */ static inline void ceph_set_error_write(struct ceph_inode_info *ci) { if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) { spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags |= CEPH_I_ERROR_WRITE; spin_unlock(&ci->i_ceph_lock); } } static inline void ceph_clear_error_write(struct ceph_inode_info *ci) { if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) { spin_lock(&ci->i_ceph_lock); ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE; spin_unlock(&ci->i_ceph_lock); } } static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, long long ordered_count) { /* * Makes sure operations that setup readdir cache (update page * cache and i_size) are strongly ordered w.r.t. the following * atomic64_set() operations. */ smp_mb(); atomic64_set(&ci->i_complete_seq[0], release_count); atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { atomic64_inc(&ci->i_release_count); } static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) { atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { return atomic64_read(&ci->i_complete_seq[0]) == atomic64_read(&ci->i_release_count); } static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) { return atomic64_read(&ci->i_complete_seq[0]) == atomic64_read(&ci->i_release_count) && atomic64_read(&ci->i_complete_seq[1]) == atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) { __ceph_dir_clear_complete(ceph_inode(inode)); } static inline void ceph_dir_clear_ordered(struct inode *inode) { __ceph_dir_clear_ordered(ceph_inode(inode)); } static inline bool ceph_dir_is_complete_ordered(struct inode *inode) { bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); smp_rmb(); return ret; } /* find a specific frag @f */ extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f); /* * choose fragment for value @v. copy frag content to pfrag, if leaf * exists */ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; } /* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) { return !RB_EMPTY_ROOT(&ci->i_caps); } extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented); extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t); extern int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int t); extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *cap); static inline int ceph_caps_issued(struct ceph_inode_info *ci) { int issued; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); return issued; } static inline int ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { int r; spin_lock(&ci->i_ceph_lock); r = __ceph_caps_issued_mask_metric(ci, mask, touch); spin_unlock(&ci->i_ceph_lock); return r; } static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf); extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) { return ci->i_nr_by_mode[0]; } extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); extern int __ceph_caps_wanted(struct ceph_inode_info *ci); /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, struct ceph_mount_options *fsopt); extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); extern void change_auth_cap_ses(struct ceph_inode_info *ci, struct ceph_mds_session *session); /* * we keep buffered readdir results attached to file->private_data */ #define CEPH_F_SYNC 1 #define CEPH_F_ATEND 2 struct ceph_file_info { short fmode; /* initialized on open */ short flags; /* CEPH_F_* */ spinlock_t rw_contexts_lock; struct list_head rw_contexts; u32 filp_gen; }; struct ceph_dir_file_info { struct ceph_file_info file_info; /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; long long dir_ordered_count; int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; struct ceph_rw_context { struct list_head list; struct task_struct *thread; int caps; }; #define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ struct ceph_rw_context _name = { \ .thread = current, \ .caps = _caps, \ } static inline void ceph_add_rw_context(struct ceph_file_info *cf, struct ceph_rw_context *ctx) { spin_lock(&cf->rw_contexts_lock); list_add(&ctx->list, &cf->rw_contexts); spin_unlock(&cf->rw_contexts_lock); } static inline void ceph_del_rw_context(struct ceph_file_info *cf, struct ceph_rw_context *ctx) { spin_lock(&cf->rw_contexts_lock); list_del(&ctx->list); spin_unlock(&cf->rw_contexts_lock); } static inline struct ceph_rw_context* ceph_find_rw_context(struct ceph_file_info *cf) { struct ceph_rw_context *ctx, *found = NULL; spin_lock(&cf->rw_contexts_lock); list_for_each_entry(ctx, &cf->rw_contexts, list) { if (ctx->thread == current) { found = ctx; break; } } spin_unlock(&cf->rw_contexts_lock); return found; } struct ceph_readdir_cache_control { struct page *page; struct dentry **dentries; int index; }; /* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) * the snapshots of their parents. * * All inodes within the realm that have capabilities are linked into a * per-realm list. */ struct ceph_snap_realm { u64 ino; struct inode *inode; atomic_t nref; struct rb_node node; u64 created, seq; u64 parent_ino; u64 parent_since; /* snapid when our current parent became so */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */ u32 num_prior_parent_snaps; /* had prior to parent_since */ u64 *snaps; /* snaps specific to this realm */ u32 num_snaps; struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ struct list_head child_item; struct list_head empty_item; /* if i have ref==0 */ struct list_head dirty_item; /* if realm needs new context */ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; struct list_head inodes_with_caps; spinlock_t inodes_with_caps_lock; }; static inline int default_congestion_kb(void) { int congestion_kb; /* * Copied from NFS * * congestion size, scale with available memory. * * 64MB: 8192k * 128MB: 11585k * 256MB: 16384k * 512MB: 23170k * 1GB: 32768k * 2GB: 46340k * 4GB: 65536k * 8GB: 92681k * 16GB: 131072k * * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (congestion_kb > 256*1024) congestion_kb = 256*1024; return congestion_kb; } /* super.c */ extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern int ceph_update_snap_trace(struct ceph_mds_client *m, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret); void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap); extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, struct ceph_snapid_map *sm); extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); void ceph_umount_begin(struct super_block *sb); /* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.). */ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item)->writing; } /* inode.c */ struct ceph_mds_reply_info_in; struct ceph_mds_reply_dirfrag; struct ceph_acl_sec_ctx; extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_evict_inode(struct inode *inode); extern void ceph_free_inode(struct inode *inode); struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_as_ctx_to_req(struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as_ctx); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, struct inode *newino); extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime); extern int ceph_fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_info_in *iinfo, struct ceph_mds_reply_dirfrag *dirinfo, struct ceph_mds_session *session, int cap_fmode, struct ceph_cap_reservation *caps_reservation); extern int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session); extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); void ceph_queue_inode_work(struct inode *inode, int work_bit); static inline void ceph_queue_vmtruncate(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE); } static inline void ceph_queue_invalidate(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES); } static inline void ceph_queue_writeback(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK); } static inline void ceph_queue_check_caps(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS); } static inline void ceph_queue_flush_snaps(struct inode *inode) { ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); } extern int ceph_try_to_choose_auth_mds(struct inode *inode, int mask); extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); struct ceph_iattr { struct ceph_fscrypt_auth *fscrypt_auth; }; extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, struct iattr *attr, struct ceph_iattr *cia); extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); void ceph_inode_shutdown(struct inode *inode); static inline bool ceph_inode_is_shutdown(struct inode *inode) { unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int state = READ_ONCE(fsc->mount_state); return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN; } /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern const struct xattr_handler * const ceph_xattr_handlers[]; struct ceph_acl_sec_ctx { #ifdef CONFIG_CEPH_FS_POSIX_ACL void *default_acl; void *acl; #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL void *sec_ctx; u32 sec_ctxlen; #endif #ifdef CONFIG_FS_ENCRYPTION struct ceph_fscrypt_auth *fscrypt_auth; #endif struct ceph_pagelist *pagelist; }; #ifdef CONFIG_SECURITY extern bool ceph_security_xattr_deadlock(struct inode *in); extern bool ceph_security_xattr_wanted(struct inode *in); #else static inline bool ceph_security_xattr_deadlock(struct inode *in) { return false; } static inline bool ceph_security_xattr_wanted(struct inode *in) { return false; } #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); static inline void ceph_security_invalidate_secctx(struct inode *inode) { security_inode_invalidate_secctx(inode); } #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) { return 0; } static inline void ceph_security_invalidate_secctx(struct inode *inode) { } #endif void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); /* acl.c */ #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx); static inline void ceph_forget_all_cached_acls(struct inode *inode) { forget_all_cached_acls(inode); } #else #define ceph_get_acl NULL #define ceph_set_acl NULL static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) { return 0; } static inline void ceph_init_inode_acls(struct inode *inode, struct ceph_acl_sec_ctx *as_ctx) { } static inline void ceph_forget_all_cached_acls(struct inode *inode) { } #endif /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, unsigned issued, unsigned wanted, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, bool queue_release); extern void __ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci); extern struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc); extern void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc); extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags); extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got); extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); extern void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode); /* addr.c */ extern const struct address_space_operations ceph_aops; extern const struct netfs_request_ops ceph_netfs_ops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) { if (ci->i_inline_version == CEPH_INLINE_NONE || ci->i_inline_version == 1) /* initial version, no data */ return false; return true; } /* file.c */ extern const struct file_operations ceph_file_fops; extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, struct iov_iter *to, int *retry_op, u64 *last_objver); extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); /* ioctl.c */ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino); /* locks.c */ extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); extern int ceph_encode_locks_to_buffer(struct inode *inode, struct ceph_filelock *flocks, int num_fcntl_locks, int num_flock_locks); extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks); /* debugfs.c */ extern void ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); /* quota.c */ enum quota_get_realm { QUOTA_GET_MAX_FILES, QUOTA_GET_MAX_BYTES, QUOTA_GET_ANY }; static inline bool __ceph_has_quota(struct ceph_inode_info *ci, enum quota_get_realm which) { bool has_quota = false; switch (which) { case QUOTA_GET_MAX_BYTES: has_quota = !!ci->i_max_bytes; break; case QUOTA_GET_MAX_FILES: has_quota = !!ci->i_max_files; break; default: has_quota = !!(ci->i_max_files || ci->i_max_bytes); } return has_quota; } extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); static inline void __ceph_update_quota(struct ceph_inode_info *ci, u64 max_bytes, u64 max_files) { bool had_quota, has_quota; had_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); ci->i_max_bytes = max_bytes; ci->i_max_files = max_files; has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY); if (had_quota != has_quota) ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota); } static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len) { int cnt = 0; if (IS_ENCRYPTED(inode)) { cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT; if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL) cnt = 0; } return cnt; } extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newlen); extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc); void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */
24 25 2024 2028 4 4 4 4 4 4 4 4 4 4 4 4 4 4 38 25 25 23 368 365 367 2795 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> #include "internal.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_root(struct fs_struct *fs, const struct path *path) { struct path old_root; path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) path_put(&old_root); } /* * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_pwd(struct fs_struct *fs, const struct path *path) { struct path old_pwd; path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) path_put(&old_pwd); } static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) return 0; *p = *new; return 1; } void chroot_fs_refs(const struct path *old_root, const struct path *new_root) { struct task_struct *g, *p; struct fs_struct *fs; int count = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { int hits = 0; spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); write_seqcount_end(&fs->seq); while (hits--) { count++; path_get(new_root); } spin_unlock(&fs->lock); } task_unlock(p); } read_unlock(&tasklist_lock); while (count--) path_put(old_root); } void free_fs_struct(struct fs_struct *fs) { path_put(&fs->root); path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { struct fs_struct *fs = tsk->fs; if (fs) { int kill; task_lock(tsk); spin_lock(&fs->lock); tsk->fs = NULL; kill = !--fs->users; spin_unlock(&fs->lock); task_unlock(tsk); if (kill) free_fs_struct(fs); } } struct fs_struct *copy_fs_struct(struct fs_struct *old) { struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; fs->in_exec = 0; spin_lock_init(&fs->lock); seqcount_spinlock_init(&fs->seq, &fs->lock); fs->umask = old->umask; spin_lock(&old->lock); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); spin_unlock(&old->lock); } return fs; } int unshare_fs_struct(void) { struct fs_struct *fs = current->fs; struct fs_struct *new_fs = copy_fs_struct(fs); int kill; if (!new_fs) return -ENOMEM; task_lock(current); spin_lock(&fs->lock); kill = !--fs->users; current->fs = new_fs; spin_unlock(&fs->lock); task_unlock(current); if (kill) free_fs_struct(fs); return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); int current_umask(void) { return current->fs->umask; } EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), .umask = 0022, };
14 1 14 14 1 1 1 1 14 1 1 1 2 14 14 1 14 14 5 14 7 7 7 7 1 1 1 1 1 7 1 6 1 5 1 4 3 14 6 13 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for some logitech "special" devices * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby * Copyright (c) 2010 Hendrik Iben */ /* */ #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/usb.h> #include <linux/wait.h> #include "usbhid/usbhid.h" #include "hid-ids.h" #include "hid-lg.h" #include "hid-lg4ff.h" #define LG_RDESC 0x001 #define LG_BAD_RELATIVE_KEYS 0x002 #define LG_DUPLICATE_USAGES 0x004 #define LG_EXPANDED_KEYMAP 0x010 #define LG_IGNORE_DOUBLED_WHEEL 0x020 #define LG_WIRELESS 0x040 #define LG_INVERT_HWHEEL 0x080 #define LG_NOGET 0x100 #define LG_FF 0x200 #define LG_FF2 0x400 #define LG_RDESC_REL_ABS 0x800 #define LG_FF3 0x1000 #define LG_FF4 0x2000 /* Size of the original descriptors of the Driving Force (and Pro) wheels */ #define DF_RDESC_ORIG_SIZE 130 #define DFP_RDESC_ORIG_SIZE 97 #define FV_RDESC_ORIG_SIZE 130 #define MOMO_RDESC_ORIG_SIZE 87 #define MOMO2_RDESC_ORIG_SIZE 87 #define FFG_RDESC_ORIG_SIZE 85 #define FG_RDESC_ORIG_SIZE 82 /* Fixed report descriptors for Logitech Driving Force (and Pro) * wheel controllers * * The original descriptors hide the separate throttle and brake axes in * a custom vendor usage page, providing only a combined value as * GenericDesktop.Y. * These descriptors remove the combined Y axis and instead report * separate throttle (Y) and brake (RZ). */ static const __u8 df_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x14, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x34, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0C, /* Report Count (12), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage (Buttons), */ 0x19, 0x01, /* Usage Minimum (1), */ 0x29, 0x0c, /* Usage Maximum (12), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x02, /* Report Count (2), */ 0x06, 0x00, 0xFF, /* Usage Page (Vendor: 65280), */ 0x09, 0x01, /* Usage (?: 1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x81, 0x02, /* Input (Variable), */ 0x25, 0x07, /* Logical Maximum (7), */ 0x46, 0x3B, 0x01, /* Physical Maximum (315), */ 0x75, 0x04, /* Report Size (4), */ 0x65, 0x14, /* Unit (Degrees), */ 0x09, 0x39, /* Usage (Hat Switch), */ 0x81, 0x42, /* Input (Variable, Null State), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x65, 0x00, /* Unit (none), */ 0x06, 0x00, 0xFF, /* Usage Page (Vendor: 65280), */ 0x09, 0x01, /* Usage (?: 1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x35, /* Usage (Rz), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x07, /* Report Count (7), */ 0x75, 0x08, /* Report Size (8), */ 0x09, 0x03, /* Usage (?: 3), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const __u8 dfp_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0E, /* Report Size (14), */ 0x14, /* Logical Minimum (0), */ 0x26, 0xFF, 0x3F, /* Logical Maximum (16383), */ 0x34, /* Physical Minimum (0), */ 0x46, 0xFF, 0x3F, /* Physical Maximum (16383), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0E, /* Report Count (14), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x0E, /* Usage Maximum (0Eh), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x04, /* Report Size (4), */ 0x25, 0x07, /* Logical Maximum (7), */ 0x46, 0x3B, 0x01, /* Physical Maximum (315), */ 0x65, 0x14, /* Unit (Degrees), */ 0x09, 0x39, /* Usage (Hat Switch), */ 0x81, 0x42, /* Input (Variable, Nullstate), */ 0x65, 0x00, /* Unit, */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x75, 0x08, /* Report Size (8), */ 0x81, 0x01, /* Input (Constant), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x35, /* Usage (Rz), */ 0x81, 0x02, /* Input (Variable), */ 0x81, 0x01, /* Input (Constant), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const __u8 fv_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0C, /* Report Count (12), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x0C, /* Usage Maximum (0Ch), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x02, /* Report Count (2), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x02, /* Usage (02h), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x25, 0x07, /* Logical Maximum (7), */ 0x46, 0x3B, 0x01, /* Physical Maximum (315), */ 0x75, 0x04, /* Report Size (4), */ 0x65, 0x14, /* Unit (Degrees), */ 0x09, 0x39, /* Usage (Hat Switch), */ 0x81, 0x42, /* Input (Variable, Null State), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x65, 0x00, /* Unit, */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x95, 0x07, /* Report Count (7), */ 0x75, 0x08, /* Report Size (8), */ 0x09, 0x03, /* Usage (03h), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const __u8 momo_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x08, /* Report Count (8), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x08, /* Usage Maximum (08h), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x75, 0x0E, /* Report Size (14), */ 0x95, 0x01, /* Report Count (1), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x00, /* Usage (00h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x75, 0x08, /* Report Size (8), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const __u8 momo2_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystick), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x0A, /* Report Count (10), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x0A, /* Usage Maximum (0Ah), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x00, /* Usage (00h), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x00, /* Usage (00h), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const __u8 ffg_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystik), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x0A, /* Report Size (10), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x03, /* Logical Maximum (1023), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x03, /* Physical Maximum (1023), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x06, /* Report Count (6), */ 0x75, 0x01, /* Report Size (1), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x06, /* Usage Maximum (06h), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x01, /* Report Count (1), */ 0x75, 0x08, /* Report Size (8), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x81, 0x01, /* Input (Constant), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x09, 0x02, /* Usage (02h), */ 0x95, 0x07, /* Report Count (7), */ 0x91, 0x02, /* Output (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const __u8 fg_rdesc_fixed[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x04, /* Usage (Joystik), */ 0xA1, 0x01, /* Collection (Application), */ 0xA1, 0x02, /* Collection (Logical), */ 0x15, 0x00, /* Logical Minimum (0), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x35, 0x00, /* Physical Minimum (0), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x01, /* Report Count (1), */ 0x09, 0x30, /* Usage (X), */ 0x81, 0x02, /* Input (Variable), */ 0xA4, /* Push, */ 0x25, 0x01, /* Logical Maximum (1), */ 0x45, 0x01, /* Physical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x02, /* Report Count (2), */ 0x81, 0x01, /* Input (Constant), */ 0x95, 0x06, /* Report Count (6), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x06, /* Usage Maximum (06h), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0xB4, /* Pop, */ 0x81, 0x02, /* Input (Constant), */ 0x09, 0x31, /* Usage (Y), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x32, /* Usage (Z), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xA1, 0x02, /* Collection (Logical), */ 0x26, 0xFF, 0x00, /* Logical Maximum (255), */ 0x46, 0xFF, 0x00, /* Physical Maximum (255), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x04, /* Report Count (4), */ 0x09, 0x02, /* Usage (02h), */ 0xB1, 0x02, /* Feature (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection, */ }; /* * Certain Logitech keyboards send in report #3 keys which are far * above the logical maximum described in descriptor. This extends * the original value of 0x28c of logical maximum to 0x104d */ static const __u8 *lg_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if ((drv_data->quirks & LG_RDESC) && *rsize >= 91 && rdesc[83] == 0x26 && rdesc[84] == 0x8c && rdesc[85] == 0x02) { hid_info(hdev, "fixing up Logitech keyboard report descriptor\n"); rdesc[84] = rdesc[89] = 0x4d; rdesc[85] = rdesc[90] = 0x10; } if ((drv_data->quirks & LG_RDESC_REL_ABS) && *rsize >= 51 && rdesc[32] == 0x81 && rdesc[33] == 0x06 && rdesc[49] == 0x81 && rdesc[50] == 0x06) { hid_info(hdev, "fixing up rel/abs in Logitech report descriptor\n"); rdesc[33] = rdesc[50] = 0x02; } switch (hdev->product) { case USB_DEVICE_ID_LOGITECH_WINGMAN_FG: if (*rsize == FG_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Wingman Formula GP report descriptor\n"); *rsize = sizeof(fg_rdesc_fixed); return fg_rdesc_fixed; } else { hid_info(hdev, "rdesc size test failed for formula gp\n"); } break; case USB_DEVICE_ID_LOGITECH_WINGMAN_FFG: if (*rsize == FFG_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Wingman Formula Force GP report descriptor\n"); *rsize = sizeof(ffg_rdesc_fixed); return ffg_rdesc_fixed; } break; /* Several wheels report as this id when operating in emulation mode. */ case USB_DEVICE_ID_LOGITECH_WHEEL: if (*rsize == DF_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Driving Force report descriptor\n"); *rsize = sizeof(df_rdesc_fixed); return df_rdesc_fixed; } break; case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL: if (*rsize == MOMO_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Momo Force (Red) report descriptor\n"); *rsize = sizeof(momo_rdesc_fixed); return momo_rdesc_fixed; } break; case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2: if (*rsize == MOMO2_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Momo Racing Force (Black) report descriptor\n"); *rsize = sizeof(momo2_rdesc_fixed); return momo2_rdesc_fixed; } break; case USB_DEVICE_ID_LOGITECH_VIBRATION_WHEEL: if (*rsize == FV_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Formula Vibration report descriptor\n"); *rsize = sizeof(fv_rdesc_fixed); return fv_rdesc_fixed; } break; case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: if (*rsize == DFP_RDESC_ORIG_SIZE) { hid_info(hdev, "fixing up Logitech Driving Force Pro report descriptor\n"); *rsize = sizeof(dfp_rdesc_fixed); return dfp_rdesc_fixed; } break; case USB_DEVICE_ID_LOGITECH_WII_WHEEL: if (*rsize >= 101 && rdesc[41] == 0x95 && rdesc[42] == 0x0B && rdesc[47] == 0x05 && rdesc[48] == 0x09) { hid_info(hdev, "fixing up Logitech Speed Force Wireless report descriptor\n"); rdesc[41] = 0x05; rdesc[42] = 0x09; rdesc[47] = 0x95; rdesc[48] = 0x0B; } break; } return rdesc; } #define lg_map_key_clear(c) hid_map_usage_clear(hi, usage, bit, max, \ EV_KEY, (c)) static int lg_ultrax_remote_mapping(struct hid_input *hi, struct hid_usage *usage, unsigned long **bit, int *max) { if ((usage->hid & HID_USAGE_PAGE) != HID_UP_LOGIVENDOR) return 0; set_bit(EV_REP, hi->input->evbit); switch (usage->hid & HID_USAGE) { /* Reported on Logitech Ultra X Media Remote */ case 0x004: lg_map_key_clear(KEY_AGAIN); break; case 0x00d: lg_map_key_clear(KEY_HOME); break; case 0x024: lg_map_key_clear(KEY_SHUFFLE); break; case 0x025: lg_map_key_clear(KEY_TV); break; case 0x026: lg_map_key_clear(KEY_MENU); break; case 0x031: lg_map_key_clear(KEY_AUDIO); break; case 0x032: lg_map_key_clear(KEY_TEXT); break; case 0x033: lg_map_key_clear(KEY_LAST); break; case 0x047: lg_map_key_clear(KEY_MP3); break; case 0x048: lg_map_key_clear(KEY_DVD); break; case 0x049: lg_map_key_clear(KEY_MEDIA); break; case 0x04a: lg_map_key_clear(KEY_VIDEO); break; case 0x04b: lg_map_key_clear(KEY_ANGLE); break; case 0x04c: lg_map_key_clear(KEY_LANGUAGE); break; case 0x04d: lg_map_key_clear(KEY_SUBTITLE); break; case 0x051: lg_map_key_clear(KEY_RED); break; case 0x052: lg_map_key_clear(KEY_CLOSE); break; default: return 0; } return 1; } static int lg_wireless_mapping(struct hid_input *hi, struct hid_usage *usage, unsigned long **bit, int *max) { if ((usage->hid & HID_USAGE_PAGE) != HID_UP_CONSUMER) return 0; switch (usage->hid & HID_USAGE) { case 0x1001: lg_map_key_clear(KEY_MESSENGER); break; case 0x1003: lg_map_key_clear(KEY_SOUND); break; case 0x1004: lg_map_key_clear(KEY_VIDEO); break; case 0x1005: lg_map_key_clear(KEY_AUDIO); break; case 0x100a: lg_map_key_clear(KEY_DOCUMENTS); break; /* The following two entries are Playlist 1 and 2 on the MX3200 */ case 0x100f: lg_map_key_clear(KEY_FN_1); break; case 0x1010: lg_map_key_clear(KEY_FN_2); break; case 0x1011: lg_map_key_clear(KEY_PREVIOUSSONG); break; case 0x1012: lg_map_key_clear(KEY_NEXTSONG); break; case 0x1013: lg_map_key_clear(KEY_CAMERA); break; case 0x1014: lg_map_key_clear(KEY_MESSENGER); break; case 0x1015: lg_map_key_clear(KEY_RECORD); break; case 0x1016: lg_map_key_clear(KEY_PLAYER); break; case 0x1017: lg_map_key_clear(KEY_EJECTCD); break; case 0x1018: lg_map_key_clear(KEY_MEDIA); break; case 0x1019: lg_map_key_clear(KEY_PROG1); break; case 0x101a: lg_map_key_clear(KEY_PROG2); break; case 0x101b: lg_map_key_clear(KEY_PROG3); break; case 0x101c: lg_map_key_clear(KEY_CYCLEWINDOWS); break; case 0x101f: lg_map_key_clear(KEY_ZOOMIN); break; case 0x1020: lg_map_key_clear(KEY_ZOOMOUT); break; case 0x1021: lg_map_key_clear(KEY_ZOOMRESET); break; case 0x1023: lg_map_key_clear(KEY_CLOSE); break; case 0x1027: lg_map_key_clear(KEY_MENU); break; /* this one is marked as 'Rotate' */ case 0x1028: lg_map_key_clear(KEY_ANGLE); break; case 0x1029: lg_map_key_clear(KEY_SHUFFLE); break; case 0x102a: lg_map_key_clear(KEY_BACK); break; case 0x102b: lg_map_key_clear(KEY_CYCLEWINDOWS); break; case 0x102d: lg_map_key_clear(KEY_WWW); break; /* The following two are 'Start/answer call' and 'End/reject call' on the MX3200 */ case 0x1031: lg_map_key_clear(KEY_OK); break; case 0x1032: lg_map_key_clear(KEY_CANCEL); break; case 0x1041: lg_map_key_clear(KEY_BATTERY); break; case 0x1042: lg_map_key_clear(KEY_WORDPROCESSOR); break; case 0x1043: lg_map_key_clear(KEY_SPREADSHEET); break; case 0x1044: lg_map_key_clear(KEY_PRESENTATION); break; case 0x1045: lg_map_key_clear(KEY_UNDO); break; case 0x1046: lg_map_key_clear(KEY_REDO); break; case 0x1047: lg_map_key_clear(KEY_PRINT); break; case 0x1048: lg_map_key_clear(KEY_SAVE); break; case 0x1049: lg_map_key_clear(KEY_PROG1); break; case 0x104a: lg_map_key_clear(KEY_PROG2); break; case 0x104b: lg_map_key_clear(KEY_PROG3); break; case 0x104c: lg_map_key_clear(KEY_PROG4); break; default: return 0; } return 1; } static int lg_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { /* extended mapping for certain Logitech hardware (Logitech cordless desktop LX500) */ static const u8 e_keymap[] = { 0,216, 0,213,175,156, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 0, 0,212, 174,167,152,161,112, 0, 0, 0,154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,183,184,185,186,187, 188,189,190,191,192,193,194, 0, 0, 0 }; struct lg_drv_data *drv_data = hid_get_drvdata(hdev); unsigned int hid = usage->hid; if (hdev->product == USB_DEVICE_ID_LOGITECH_RECEIVER && lg_ultrax_remote_mapping(hi, usage, bit, max)) return 1; if ((drv_data->quirks & LG_WIRELESS) && lg_wireless_mapping(hi, usage, bit, max)) return 1; if ((hid & HID_USAGE_PAGE) != HID_UP_BUTTON) return 0; hid &= HID_USAGE; /* Special handling for Logitech Cordless Desktop */ if (field->application == HID_GD_MOUSE) { if ((drv_data->quirks & LG_IGNORE_DOUBLED_WHEEL) && (hid == 7 || hid == 8)) return -1; } else { if ((drv_data->quirks & LG_EXPANDED_KEYMAP) && hid < ARRAY_SIZE(e_keymap) && e_keymap[hid] != 0) { hid_map_usage(hi, usage, bit, max, EV_KEY, e_keymap[hid]); return 1; } } return 0; } static int lg_input_mapped(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if ((drv_data->quirks & LG_BAD_RELATIVE_KEYS) && usage->type == EV_KEY && (field->flags & HID_MAIN_ITEM_RELATIVE)) field->flags &= ~HID_MAIN_ITEM_RELATIVE; if ((drv_data->quirks & LG_DUPLICATE_USAGES) && (usage->type == EV_KEY || usage->type == EV_REL || usage->type == EV_ABS)) clear_bit(usage->code, *bit); /* Ensure that Logitech wheels are not given a default fuzz/flat value */ if (usage->type == EV_ABS && (usage->code == ABS_X || usage->code == ABS_Y || usage->code == ABS_Z || usage->code == ABS_RZ)) { switch (hdev->product) { case USB_DEVICE_ID_LOGITECH_G29_WHEEL: case USB_DEVICE_ID_LOGITECH_WINGMAN_FG: case USB_DEVICE_ID_LOGITECH_WINGMAN_FFG: case USB_DEVICE_ID_LOGITECH_WHEEL: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL: case USB_DEVICE_ID_LOGITECH_DFP_WHEEL: case USB_DEVICE_ID_LOGITECH_G25_WHEEL: case USB_DEVICE_ID_LOGITECH_DFGT_WHEEL: case USB_DEVICE_ID_LOGITECH_G27_WHEEL: case USB_DEVICE_ID_LOGITECH_WII_WHEEL: case USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2: case USB_DEVICE_ID_LOGITECH_VIBRATION_WHEEL: field->application = HID_GD_MULTIAXIS; break; default: break; } } return 0; } static int lg_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if ((drv_data->quirks & LG_INVERT_HWHEEL) && usage->code == REL_HWHEEL) { input_event(field->hidinput->input, usage->type, usage->code, -value); return 1; } if (drv_data->quirks & LG_FF4) { return lg4ff_adjust_input_event(hdev, field, usage, value, drv_data); } return 0; } static int lg_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *rd, int size) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if (drv_data->quirks & LG_FF4) return lg4ff_raw_event(hdev, report, rd, size, drv_data); return 0; } static int lg_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct usb_interface *iface; __u8 iface_num; unsigned int connect_mask = HID_CONNECT_DEFAULT; struct lg_drv_data *drv_data; int ret; if (!hid_is_usb(hdev)) return -EINVAL; iface = to_usb_interface(hdev->dev.parent); iface_num = iface->cur_altsetting->desc.bInterfaceNumber; /* G29 only work with the 1st interface */ if ((hdev->product == USB_DEVICE_ID_LOGITECH_G29_WHEEL) && (iface_num != 0)) { dbg_hid("%s: ignoring ifnum %d\n", __func__, iface_num); return -ENODEV; } drv_data = kzalloc(sizeof(struct lg_drv_data), GFP_KERNEL); if (!drv_data) { hid_err(hdev, "Insufficient memory, cannot allocate driver data\n"); return -ENOMEM; } drv_data->quirks = id->driver_data; hid_set_drvdata(hdev, (void *)drv_data); if (drv_data->quirks & LG_NOGET) hdev->quirks |= HID_QUIRK_NOGET; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err_free; } if (drv_data->quirks & (LG_FF | LG_FF2 | LG_FF3 | LG_FF4)) connect_mask &= ~HID_CONNECT_FF; ret = hid_hw_start(hdev, connect_mask); if (ret) { hid_err(hdev, "hw start failed\n"); goto err_free; } /* Setup wireless link with Logitech Wii wheel */ if (hdev->product == USB_DEVICE_ID_LOGITECH_WII_WHEEL) { static const unsigned char cbuf[] = { 0x00, 0xAF, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; u8 *buf = kmemdup(cbuf, sizeof(cbuf), GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto err_stop; } ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf), HID_FEATURE_REPORT, HID_REQ_SET_REPORT); if (ret >= 0) { /* insert a little delay of 10 jiffies ~ 40ms */ wait_queue_head_t wait; init_waitqueue_head (&wait); wait_event_interruptible_timeout(wait, 0, msecs_to_jiffies(40)); /* Select random Address */ buf[1] = 0xB2; get_random_bytes(&buf[2], 2); ret = hid_hw_raw_request(hdev, buf[0], buf, sizeof(cbuf), HID_FEATURE_REPORT, HID_REQ_SET_REPORT); } kfree(buf); } if (drv_data->quirks & LG_FF) ret = lgff_init(hdev); else if (drv_data->quirks & LG_FF2) ret = lg2ff_init(hdev); else if (drv_data->quirks & LG_FF3) ret = lg3ff_init(hdev); else if (drv_data->quirks & LG_FF4) ret = lg4ff_init(hdev); if (ret) goto err_stop; return 0; err_stop: hid_hw_stop(hdev); err_free: kfree(drv_data); return ret; } static void lg_remove(struct hid_device *hdev) { struct lg_drv_data *drv_data = hid_get_drvdata(hdev); if (drv_data->quirks & LG_FF4) lg4ff_deinit(hdev); hid_hw_stop(hdev); kfree(drv_data); } static const struct hid_device_id lg_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER), .driver_data = LG_RDESC | LG_WIRELESS }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RECEIVER), .driver_data = LG_BAD_RELATIVE_KEYS }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_DESKTOP), .driver_data = LG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_ELITE_KBD), .driver_data = LG_IGNORE_DOUBLED_WHEEL | LG_EXPANDED_KEYMAP }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_CORDLESS_DESKTOP_LX500), .driver_data = LG_IGNORE_DOUBLED_WHEEL | LG_EXPANDED_KEYMAP }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_EXTREME_3D), .driver_data = LG_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DUAL_ACTION), .driver_data = LG_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WHEEL), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD_CORD), .driver_data = LG_FF2 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2_2), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G29_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_F3D), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_FORCE3D_PRO), .driver_data = LG_FF }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOMO_WHEEL), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_MOMO_WHEEL2), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_VIBRATION_WHEEL), .driver_data = LG_FF2 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G25_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DFGT_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G27_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_DFP_WHEEL), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WII_WHEEL), .driver_data = LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_FG), .driver_data = LG_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_WINGMAN_FFG), .driver_data = LG_NOGET | LG_FF4 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_RUMBLEPAD2), .driver_data = LG_NOGET | LG_FF2 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_FLIGHT_SYSTEM_G940), .driver_data = LG_FF3 }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACENAVIGATOR), .driver_data = LG_RDESC_REL_ABS }, { HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_SPACETRAVELLER), .driver_data = LG_RDESC_REL_ABS }, { } }; MODULE_DEVICE_TABLE(hid, lg_devices); static struct hid_driver lg_driver = { .name = "logitech", .id_table = lg_devices, .report_fixup = lg_report_fixup, .input_mapping = lg_input_mapping, .input_mapped = lg_input_mapped, .event = lg_event, .raw_event = lg_raw_event, .probe = lg_probe, .remove = lg_remove, }; module_hid_driver(lg_driver); #ifdef CONFIG_LOGIWHEELS_FF int lg4ff_no_autoswitch = 0; module_param_named(lg4ff_no_autoswitch, lg4ff_no_autoswitch, int, S_IRUGO); MODULE_PARM_DESC(lg4ff_no_autoswitch, "Do not switch multimode wheels to their native mode automatically"); #endif MODULE_DESCRIPTION("HID driver for some logitech \"special\" devices"); MODULE_LICENSE("GPL");
16 16 767 328 599 10 582 121 135 1 591 586 129 130 454 10 38 604 15 21 604 320 44 24 605 580 5 10 564 10 6 722 26 2 26 26 4 25 380 614 605 588 714 720 724 720 715 724 8 164 165 157 16 37 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 #ifndef IOU_CORE_H #define IOU_CORE_H #include <linux/errno.h> #include <linux/lockdep.h> #include <linux/resume_user_mode.h> #include <linux/kasan.h> #include <linux/poll.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> #include "io-wq.h" #include "slist.h" #include "filetable.h" #ifndef CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #endif enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, /* * Requeue the task_work to restart operations on this request. The * actual value isn't important, should just be not an otherwise * valid error code, yet less than -MAX_ERRNO and valid internally. */ IOU_REQUEUE = -3072, /* * Intended only when both IO_URING_F_MULTISHOT is passed * to indicate to the poll runner that multishot should be * removed and the result is set on req->cqe.res. */ IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; unsigned cq_tail; unsigned cq_min_tail; unsigned nr_timeouts; int hit_timeout; ktime_t min_timeout; ktime_t timeout; struct hrtimer t; #ifdef CONFIG_NET_RX_BUSY_POLL ktime_t napi_busy_poll_dt; bool napi_prefer_busy_poll; #endif }; static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); void io_activate_pollwq(struct io_ring_ctx *ctx); static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { #if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); } else if (!ctx->task_complete) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* * ->submitter_task may be NULL and we can still post a CQE, * if the ring has been setup with IORING_SETUP_R_DISABLED. * Not from an SQE, as those cannot be submitted, but via * updating tagged resources. */ if (ctx->submitter_task->flags & PF_EXITING) lockdep_assert(current_work()); else lockdep_assert(current == ctx->submitter_task); } #endif } static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || ctx->submit_state.cq_flush) __io_submit_flush_completions(ctx); } #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, bool overflow) { io_lockdep_assert_cq_locked(ctx); if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { if (unlikely(!io_cqe_cache_refill(ctx, overflow))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; if (ctx->flags & IORING_SETUP_CQE32) ctx->cqe_cached++; return true; } static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) { return io_get_cqe_overflow(ctx, ret, false); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_uring_cqe *cqe; /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in * the ring. */ if (unlikely(!io_get_cqe(ctx, &cqe))) return false; if (trace_io_uring_complete_enabled()) trace_io_uring_complete(req->ctx, req, req->cqe.user_data, req->cqe.res, req->cqe.flags, req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } return true; } static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; if (req->flags & REQ_F_CQE_SKIP) { req->flags &= ~REQ_F_CQE_SKIP; req->flags |= REQ_F_SKIP_LINK_CQES; } } static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; req->cqe.flags = cflags; } static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; } static inline void io_put_file(struct io_kiocb *req) { if (!(req->flags & REQ_F_FIXED_FILE) && req->file) fput(req->file); } static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) { lockdep_assert_held(&ctx->uring_lock); if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_unlock(&ctx->uring_lock); } static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) { /* * "Normal" inline submissions always hold the uring_lock, since we * grab it from the system call. Same is true for the SQPOLL offload. * The only exception is when we've detached the request and issue it * from an async worker thread, grab the lock for that case. */ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) mutex_lock(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock); } static inline void io_commit_cqring(struct io_ring_ctx *ctx) { /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) { if (wq_has_sleeper(&ctx->poll_wq)) __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * Trigger waitqueue handler on all waiters on our waitqueue. This * won't necessarily wake up all the tasks, io_should_wake() will make * that decision. * * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter * set in the mask so that if we recurse back into our own poll * waitqueue handlers, we know we have a dependency between eventfd or * epoll and should terminate multishot poll at that point. */ if (wq_has_sleeper(&ctx->cq_wait)) __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned int entries; /* make sure SQ entry isn't read before tail */ entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; return min(entries, ctx->sq_entries); } static inline int io_run_task_work(void) { bool ret = false; /* * Always check-and-clear the task_work notification signal. With how * signaling works for task_work, we can find it set with nothing to * run. We need to clear it for that case, like get_signal() does. */ if (test_thread_flag(TIF_NOTIFY_SIGNAL)) clear_notify_signal(); /* * PF_IO_WORKER never returns to userspace, so check here if we have * notify work that needs processing. */ if (current->flags & PF_IO_WORKER) { if (test_thread_flag(TIF_NOTIFY_RESUME)) { __set_current_state(TASK_RUNNING); resume_user_mode_work(NULL); } if (current->io_uring) { unsigned int count = 0; tctx_task_work_run(current->io_uring, UINT_MAX, &count); if (count) ret = true; } } if (task_work_pending(current)) { __set_current_state(TASK_RUNNING); task_work_run(); ret = true; } return ret; } static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { return task_work_pending(current) || !llist_empty(&ctx->work_llist); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { lockdep_assert_held(&ctx->uring_lock); } /* * Don't complete immediately but use deferred completion infrastructure. * Protected by ->uring_lock and can only be used either with * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex. */ static inline void io_req_complete_defer(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { struct io_submit_state *state = &req->ctx->submit_state; lockdep_assert_held(&req->ctx->uring_lock); wq_list_add_tail(&req->comp_list, &state->compl_reqs); } static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } static inline void io_get_task_refs(int nr) { struct io_uring_task *tctx = current->io_uring; tctx->cached_refs -= nr; if (unlikely(tctx->cached_refs < 0)) io_task_refs_refill(tctx); } static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) { return !ctx->submit_state.free_list.next; } extern struct kmem_cache *req_cachep; extern struct kmem_cache *io_buf_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); wq_stack_extract(&ctx->submit_state.free_list); return req; } static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) { if (unlikely(io_req_cache_empty(ctx))) { if (!__io_alloc_req_refill(ctx)) return false; } *req = io_extract_req(ctx); return true; } static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) { return likely(ctx->submitter_task == current); } static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) { return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || ctx->submitter_task == current); } static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } /* * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each * slot. */ static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) { if (ctx->flags & IORING_SETUP_SQE128) return 2 * sizeof(struct io_uring_sqe); return sizeof(struct io_uring_sqe); } static inline bool io_file_can_poll(struct io_kiocb *req) { if (req->flags & REQ_F_CAN_POLL) return true; if (req->file && file_can_poll(req->file)) { req->flags |= REQ_F_CAN_POLL; return true; } return false; } static inline ktime_t io_get_time(struct io_ring_ctx *ctx) { if (ctx->clockid == CLOCK_MONOTONIC) return ktime_get(); return ktime_get_with_offset(ctx->clock_offset); } enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || !llist_empty(&ctx->work_llist); } #endif
930 8414 8432 203 929 930 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KERNEL_PRINTK__ #define __KERNEL_PRINTK__ #include <linux/stdarg.h> #include <linux/init.h> #include <linux/kern_levels.h> #include <linux/linkage.h> #include <linux/ratelimit_types.h> #include <linux/once_lite.h> struct console; extern const char linux_banner[]; extern const char linux_proc_banner[]; extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ #define PRINTK_MAX_SINGLE_HEADER_LEN 2 static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { switch (buffer[1]) { case '0' ... '7': case 'c': /* KERN_CONT */ return buffer[1]; } } return 0; } static inline const char *printk_skip_level(const char *buffer) { if (printk_get_level(buffer)) return buffer + 2; return buffer; } static inline const char *printk_skip_headers(const char *buffer) { while (printk_get_level(buffer)) buffer = printk_skip_level(buffer); return buffer; } /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ /* * Default used to be hard-coded at 7, quiet used to be hardcoded at 4, * we're now allowing both to be set from kernel config. */ #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET int match_devname_and_update_preferred_console(const char *match, const char *name, const short idx); extern int console_printk[]; #define console_loglevel (console_printk[0]) #define default_message_loglevel (console_printk[1]) #define minimum_console_loglevel (console_printk[2]) #define default_console_loglevel (console_printk[3]) extern void console_verbose(void); /* strlen("ratelimit") + 1 */ #define DEVKMSG_STR_MAX_SIZE 10 extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE]; struct ctl_table; extern int suppress_printk; struct va_format { const char *fmt; va_list *va; }; /* * FW_BUG * Add this to a message where you are sure the firmware is buggy or behaves * really stupid or out of spec. Be aware that the responsible BIOS developer * should be able to fix this issue or at least get a concrete idea of the * problem by reading your message without the need of looking at the kernel * code. * * Use it for definite and high priority BIOS bugs. * * FW_WARN * Use it for not that clear (e.g. could the kernel messed up things already?) * and medium priority BIOS bugs. * * FW_INFO * Use this one if you want to tell the user or vendor about something * suspicious, but generally harmless related to the firmware. * * Use it for information or very low priority BIOS bugs. */ #define FW_BUG "[Firmware Bug]: " #define FW_WARN "[Firmware Warn]: " #define FW_INFO "[Firmware Info]: " /* * HW_ERR * Add this to a message for hardware errors, so that user can report * it to hardware vendor instead of LKML or software vendor. */ #define HW_ERR "[Hardware Error]: " /* * DEPRECATED * Add this to a message whenever you want to warn user space about the use * of a deprecated aspect of an API so they can stop using it */ #define DEPRECATED "[Deprecated]: " /* * Dummy printk for disabled debugging statements to use whilst maintaining * gcc's format checking. */ #define no_printk(fmt, ...) \ ({ \ if (0) \ _printk(fmt, ##__VA_ARGS__); \ 0; \ }) #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif struct dev_printk_info; #ifdef CONFIG_PRINTK asmlinkage __printf(4, 0) int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args); asmlinkage __printf(1, 0) int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int _printk(const char *fmt, ...); /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); extern void __printk_deferred_enter(void); extern void __printk_deferred_exit(void); /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts * must be disabled for the deferred duration. */ #define printk_deferred_enter() __printk_deferred_enter() #define printk_deferred_exit() __printk_deferred_exit() /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use * printk_ratelimited() or plain old __ratelimit(). */ extern int __printk_ratelimit(const char *func); #define printk_ratelimit() __printk_ratelimit(__func__) extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); extern int printk_delay_msec; extern int dmesg_restrict; extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); void console_try_replay_all(void); void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) { return 0; } static inline __printf(1, 2) __cold int _printk(const char *s, ...) { return 0; } static inline __printf(1, 2) __cold int _printk_deferred(const char *s, ...) { return 0; } static inline void printk_deferred_enter(void) { } static inline void printk_deferred_exit(void) { } static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec) { return false; } static inline void wake_up_klogd(void) { } static inline char *log_buf_addr_get(void) { return NULL; } static inline u32 log_buf_len_get(void) { return 0; } static inline void log_buf_vmcoreinfo_setup(void) { } static inline void setup_log_buf(int early) { } static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...) { } static inline void dump_stack_print_info(const char *log_lvl) { } static inline void show_regs_print_info(const char *log_lvl) { } static inline void dump_stack_lvl(const char *log_lvl) { } static inline void dump_stack(void) { } static inline void printk_trigger_flush(void) { } static inline void console_try_replay_all(void) { } static inline void printk_legacy_allow_panic_sync(void) { } static inline bool nbcon_device_try_acquire(struct console *con) { return false; } static inline void nbcon_device_release(struct console *con) { } static inline void nbcon_atomic_flush_unsafe(void) { } #endif bool this_cpu_in_panic(void); #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); extern void __printk_cpu_sync_put(void); #else #define __printk_cpu_sync_try_get() true #define __printk_cpu_sync_wait() #define __printk_cpu_sync_put() #endif /* CONFIG_SMP */ /** * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk * cpu-reentrant spinning lock. * @flags: Stack-allocated storage for saving local interrupt state, * to be passed to printk_cpu_sync_put_irqrestore(). * * If the lock is owned by another CPU, spin until it becomes available. * Interrupts are restored while spinning. * * CAUTION: This function must be used carefully. It does not behave like a * typical lock. Here are important things to watch out for... * * * This function is reentrant on the same CPU. Therefore the calling * code must not assume exclusive access to data if code accessing the * data can run reentrant or within NMI context on the same CPU. * * * If there exists usage of this function from NMI context, it becomes * unsafe to perform any type of locking or spinning to wait for other * CPUs after calling this function from any context. This includes * using spinlocks or any other busy-waiting synchronization methods. */ #define printk_cpu_sync_get_irqsave(flags) \ for (;;) { \ local_irq_save(flags); \ if (__printk_cpu_sync_try_get()) \ break; \ local_irq_restore(flags); \ __printk_cpu_sync_wait(); \ } /** * printk_cpu_sync_put_irqrestore() - Release the printk cpu-reentrant spinning * lock and restore interrupts. * @flags: Caller's saved interrupt state, from printk_cpu_sync_get_irqsave(). */ #define printk_cpu_sync_put_irqrestore(flags) \ do { \ __printk_cpu_sync_put(); \ local_irq_restore(flags); \ } while (0) extern int kptr_restrict; /** * pr_fmt - used by the pr_*() macros to generate the printk format string * @fmt: format string passed from a pr_*() macro * * This macro can be used to generate a unified format string for pr_*() * macros. A common use is to prefix all pr_*() messages in a file with a common * string. For example, defining this at the top of a source file: * * #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt * * would prefix all pr_info, pr_emerg... messages in the file with the module * name. */ #ifndef pr_fmt #define pr_fmt(fmt) fmt #endif struct module; #ifdef CONFIG_PRINTK_INDEX struct pi_entry { const char *fmt; const char *func; const char *file; unsigned int line; /* * While printk and pr_* have the level stored in the string at compile * time, some subsystems dynamically add it at runtime through the * format string. For these dynamic cases, we allow the subsystem to * tell us the level at compile time. * * NULL indicates that the level, if any, is stored in fmt. */ const char *level; /* * The format string used by various subsystem specific printk() * wrappers to prefix the message. * * Note that the static prefix defined by the pr_fmt() macro is stored * directly in the message format (@fmt), not here. */ const char *subsys_fmt_prefix; } __packed; #define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix) \ do { \ if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \ /* * We check __builtin_constant_p multiple times here * for the same input because GCC will produce an error * if we try to assign a static variable to fmt if it * is not a constant, even with the outer if statement. */ \ static const struct pi_entry _entry \ __used = { \ .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \ .func = __func__, \ .file = __FILE__, \ .line = __LINE__, \ .level = __builtin_constant_p(_level) ? (_level) : NULL, \ .subsys_fmt_prefix = _subsys_fmt_prefix,\ }; \ static const struct pi_entry *_entry_ptr \ __used __section(".printk_index") = &_entry; \ } \ } while (0) #else /* !CONFIG_PRINTK_INDEX */ #define __printk_index_emit(...) do {} while (0) #endif /* CONFIG_PRINTK_INDEX */ /* * Some subsystems have their own custom printk that applies a va_format to a * generic format, for example, to include a device number or other metadata * alongside the format supplied by the caller. * * In order to store these in the way they would be emitted by the printk * infrastructure, the subsystem provides us with the start, fixed string, and * any subsequent text in the format string. * * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the * first one. * * subsys_fmt_prefix must be known at compile time, or compilation will fail * (since this is a mistake). If fmt or level is not known at compile time, no * index entry will be made (since this can legitimately happen). */ #define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \ __printk_index_emit(fmt, level, subsys_fmt_prefix) #define printk_index_wrap(_p_func, _fmt, ...) \ ({ \ __printk_index_emit(_fmt, NULL, NULL); \ _p_func(_fmt, ##__VA_ARGS__); \ }) /** * printk - print a kernel message * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. * * If printk indexing is enabled, _printk() is called from printk_index_wrap. * Otherwise, printk is simply #defined to _printk. * * We try to grab the console_lock. If we succeed, it's easy - we log the * output and call the console drivers. If we fail to get the semaphore, we * place the output into the log buffer and return. The current holder of * the console_sem will notice the new output in console_unlock(); and will * send it to the consoles before releasing the lock. * * One effect of this deferred printing is that code which calls printk() and * then changes console_loglevel may break. This is because console_loglevel * is inspected when the actual printing occurs. * * See also: * printf(3) * * See the vsnprintf() documentation for format string extensions over C99. */ #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__) #define printk_deferred(fmt, ...) \ printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__) /** * pr_emerg - Print an emergency-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_EMERG loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) /** * pr_alert - Print an alert-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_ALERT loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_alert(fmt, ...) \ printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) /** * pr_crit - Print a critical-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_CRIT loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_crit(fmt, ...) \ printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) /** * pr_err - Print an error-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_ERR loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_err(fmt, ...) \ printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) /** * pr_warn - Print a warning-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_WARNING loglevel. It uses pr_fmt() * to generate the format string. */ #define pr_warn(fmt, ...) \ printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) /** * pr_notice - Print a notice-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_NOTICE loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_notice(fmt, ...) \ printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) /** * pr_info - Print an info-level message * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_INFO loglevel. It uses pr_fmt() to * generate the format string. */ #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /** * pr_cont - Continues a previous log message in the same line. * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_CONT loglevel. It should only be * used when continuing a log message with no newline ('\n') enclosed. Otherwise * it defaults back to KERN_DEFAULT loglevel. */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) /** * pr_devel - Print a debug-level message conditionally * @fmt: format string * @...: arguments for the format string * * This macro expands to a printk with KERN_DEBUG loglevel if DEBUG is * defined. Otherwise it does nothing. * * It uses pr_fmt() to generate the format string. */ #ifdef DEBUG #define pr_devel(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include <linux/dynamic_debug.h> /** * pr_debug - Print a debug-level message conditionally * @fmt: format string * @...: arguments for the format string * * This macro expands to dynamic_pr_debug() if CONFIG_DYNAMIC_DEBUG is * set. Otherwise, if DEBUG is defined, it's equivalent to a printk with * KERN_DEBUG loglevel. If DEBUG is not defined it does nothing. * * It uses pr_fmt() to generate the format string (dynamic_pr_debug() uses * pr_fmt() internally). */ #define pr_debug(fmt, ...) \ dynamic_pr_debug(fmt, ##__VA_ARGS__) #elif defined(DEBUG) #define pr_debug(fmt, ...) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * Print a one-time message (analogous to WARN_ONCE() et al): */ #ifdef CONFIG_PRINTK #define printk_once(fmt, ...) \ DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #define printk_deferred_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_once(fmt, ...) \ printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_once(fmt, ...) \ printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_once(fmt, ...) \ printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_once(fmt, ...) \ printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_once(fmt, ...) \ printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_once(fmt, ...) \ printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_once, don't do that... */ #if defined(DEBUG) #define pr_devel_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(DEBUG) #define pr_debug_once(fmt, ...) \ printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_once(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case */ #ifdef CONFIG_PRINTK #define printk_ratelimited(fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ \ if (__ratelimit(&_rs)) \ printk(fmt, ##__VA_ARGS__); \ }) #else #define printk_ratelimited(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_ratelimited(fmt, ...) \ printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__) #define pr_crit_ratelimited(fmt, ...) \ printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__) #define pr_err_ratelimited(fmt, ...) \ printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) #define pr_warn_ratelimited(fmt, ...) \ printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) #define pr_notice_ratelimited(fmt, ...) \ printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info_ratelimited(fmt, ...) \ printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) /* no pr_cont_ratelimited, don't do that... */ #if defined(DEBUG) #define pr_devel_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_devel_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif /* If you are writing a driver, please use dev_dbg instead */ #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt)); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ __ratelimit(&_rs)) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define pr_debug_ratelimited(fmt, ...) \ printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #else #define pr_debug_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif extern const struct file_operations kmsg_fops; enum { DUMP_PREFIX_NONE, DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, char *linebuf, size_t linebuflen, bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); #else static inline void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, const void *buf, size_t len) { } #endif #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #elif defined(DEBUG) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) #else static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { } #endif /** * print_hex_dump_bytes - shorthand form of print_hex_dump() with default params * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @buf: data blob to dump * @len: number of bytes in the @buf * * Calls print_hex_dump(), with log level of KERN_DEBUG, * rowsize of 16, groupsize of 1, and ASCII output included. */ #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) #endif
83 83 83 1 2 2 2 1 44 6 5 4 5 4 38 38 38 3 38 36 2 2 2 1 38 38 18 18 18 18 2 18 14 12 4 4 4 4 4 16 2 2 2 5 5 5 5 5 5 5 5 5 5 11 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 // SPDX-License-Identifier: GPL-2.0-only /* * Common helpers for stackable filesystems and backing files. * * Forked from fs/overlayfs/file.c. * * Copyright (C) 2017 Red Hat, Inc. * Copyright (C) 2023 CTERA Networks. */ #include <linux/fs.h> #include <linux/backing-file.h> #include <linux/splice.h> #include <linux/mm.h> #include "internal.h" /** * backing_file_open - open a backing file for kernel internal use * @user_path: path that the user reuqested to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). * @user_path may be on the stackable filesystem and @real_path on the * underlying filesystem. In this case, we want to be able to return the * @user_path of the stackable filesystem. This is done by embedding the * returned file into a container structure that also stores the stacked * file's path, which can be retrieved using backing_file_user_path(). */ struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred) { struct file *f; int error; f = alloc_empty_backing_file(flags, cred); if (IS_ERR(f)) return f; path_get(user_path); *backing_file_user_path(f) = *user_path; error = vfs_open(real_path, f); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(backing_file_open); struct file *backing_tmpfile_open(const struct path *user_path, int flags, const struct path *real_parentpath, umode_t mode, const struct cred *cred) { struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt); struct file *f; int error; f = alloc_empty_backing_file(flags, cred); if (IS_ERR(f)) return f; path_get(user_path); *backing_file_user_path(f) = *user_path; error = vfs_tmpfile(real_idmap, real_parentpath, f, mode); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL(backing_tmpfile_open); struct backing_aio { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; /* used for aio completion */ void (*end_write)(struct file *); struct work_struct work; long res; }; static struct kmem_cache *backing_aio_cachep; #define BACKING_IOCB_MASK \ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) static rwf_t iocb_to_rw_flags(int flags) { return (__force rwf_t)(flags & BACKING_IOCB_MASK); } static void backing_aio_put(struct backing_aio *aio) { if (refcount_dec_and_test(&aio->ref)) { fput(aio->iocb.ki_filp); kmem_cache_free(backing_aio_cachep, aio); } } static void backing_aio_cleanup(struct backing_aio *aio, long res) { struct kiocb *iocb = &aio->iocb; struct kiocb *orig_iocb = aio->orig_iocb; if (aio->end_write) aio->end_write(orig_iocb->ki_filp); orig_iocb->ki_pos = iocb->ki_pos; backing_aio_put(aio); } static void backing_aio_rw_complete(struct kiocb *iocb, long res) { struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); struct kiocb *orig_iocb = aio->orig_iocb; if (iocb->ki_flags & IOCB_WRITE) kiocb_end_write(iocb); backing_aio_cleanup(aio, res); orig_iocb->ki_complete(orig_iocb, res); } static void backing_aio_complete_work(struct work_struct *work) { struct backing_aio *aio = container_of(work, struct backing_aio, work); backing_aio_rw_complete(&aio->iocb, aio->res); } static void backing_aio_queue_completion(struct kiocb *iocb, long res) { struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); /* * Punt to a work queue to serialize updates of mtime/size. */ aio->res = res; INIT_WORK(&aio->work, backing_aio_complete_work); queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, &aio->work); } static int backing_aio_init_wq(struct kiocb *iocb) { struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; if (sb->s_dio_done_wq) return 0; return sb_init_dio_done_wq(sb); } ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { struct backing_aio *aio = NULL; const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; if (!iov_iter_count(iter)) return 0; if (iocb->ki_flags & IOCB_DIRECT && !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; old_cred = override_creds(ctx->cred); if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); } else { ret = -ENOMEM; aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); if (!aio) goto out; aio->orig_iocb = iocb; kiocb_clone(&aio->iocb, iocb, get_file(file)); aio->iocb.ki_complete = backing_aio_rw_complete; refcount_set(&aio->ref, 2); ret = vfs_iocb_iter_read(file, &aio->iocb, iter); backing_aio_put(aio); if (ret != -EIOCBQUEUED) backing_aio_cleanup(aio, ret); } out: revert_creds(old_cred); if (ctx->accessed) ctx->accessed(ctx->user_file); return ret; } EXPORT_SYMBOL_GPL(backing_file_read_iter); ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; if (!iov_iter_count(iter)) return 0; ret = file_remove_privs(ctx->user_file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT && !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * Stacked filesystems don't support deferred completions, don't copy * this property in case it is set by the issuer. */ flags &= ~IOCB_DIO_CALLER_COMP; old_cred = override_creds(ctx->cred); if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); if (ctx->end_write) ctx->end_write(ctx->user_file); } else { struct backing_aio *aio; ret = backing_aio_init_wq(iocb); if (ret) goto out; ret = -ENOMEM; aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); if (!aio) goto out; aio->orig_iocb = iocb; aio->end_write = ctx->end_write; kiocb_clone(&aio->iocb, iocb, get_file(file)); aio->iocb.ki_flags = flags; aio->iocb.ki_complete = backing_aio_queue_completion; refcount_set(&aio->ref, 2); ret = vfs_iocb_iter_write(file, &aio->iocb, iter); backing_aio_put(aio); if (ret != -EIOCBQUEUED) backing_aio_cleanup(aio, ret); } out: revert_creds(old_cred); return ret; } EXPORT_SYMBOL_GPL(backing_file_write_iter); ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) return -EIO; old_cred = override_creds(ctx->cred); ret = vfs_splice_read(in, ppos, pipe, len, flags); revert_creds(old_cred); if (ctx->accessed) ctx->accessed(ctx->user_file); return ret; } EXPORT_SYMBOL_GPL(backing_file_splice_read); ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; if (!out->f_op->splice_write) return -EINVAL; ret = file_remove_privs(ctx->user_file); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); ret = out->f_op->splice_write(pipe, out, ppos, len, flags); file_end_write(out); revert_creds(old_cred); if (ctx->end_write) ctx->end_write(ctx->user_file); return ret; } EXPORT_SYMBOL_GPL(backing_file_splice_write); int backing_file_mmap(struct file *file, struct vm_area_struct *vma, struct backing_file_ctx *ctx) { const struct cred *old_cred; int ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) || WARN_ON_ONCE(ctx->user_file != vma->vm_file)) return -EIO; if (!file->f_op->mmap) return -ENODEV; vma_set_file(vma, file); old_cred = override_creds(ctx->cred); ret = call_mmap(vma->vm_file, vma); revert_creds(old_cred); if (ctx->accessed) ctx->accessed(ctx->user_file); return ret; } EXPORT_SYMBOL_GPL(backing_file_mmap); static int __init backing_aio_init(void) { backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN); if (!backing_aio_cachep) return -ENOMEM; return 0; } fs_initcall(backing_aio_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_USB_TYPEC_H #define __LINUX_USB_TYPEC_H #include <linux/types.h> /* USB Type-C Specification releases */ #define USB_TYPEC_REV_1_0 0x100 /* 1.0 */ #define USB_TYPEC_REV_1_1 0x110 /* 1.1 */ #define USB_TYPEC_REV_1_2 0x120 /* 1.2 */ #define USB_TYPEC_REV_1_3 0x130 /* 1.3 */ #define USB_TYPEC_REV_1_4 0x140 /* 1.4 */ #define USB_TYPEC_REV_2_0 0x200 /* 2.0 */ struct typec_partner; struct typec_cable; struct typec_plug; struct typec_port; struct typec_altmode_ops; struct typec_cable_ops; struct fwnode_handle; struct device; struct usb_power_delivery; struct usb_power_delivery_desc; enum typec_port_type { TYPEC_PORT_SRC, TYPEC_PORT_SNK, TYPEC_PORT_DRP, }; enum typec_port_data { TYPEC_PORT_DFP, TYPEC_PORT_UFP, TYPEC_PORT_DRD, }; enum typec_plug_type { USB_PLUG_NONE, USB_PLUG_TYPE_A, USB_PLUG_TYPE_B, USB_PLUG_TYPE_C, USB_PLUG_CAPTIVE, }; enum typec_data_role { TYPEC_DEVICE, TYPEC_HOST, }; enum typec_role { TYPEC_SINK, TYPEC_SOURCE, }; static inline int is_sink(enum typec_role role) { return role == TYPEC_SINK; } static inline int is_source(enum typec_role role) { return role == TYPEC_SOURCE; } enum typec_pwr_opmode { TYPEC_PWR_MODE_USB, TYPEC_PWR_MODE_1_5A, TYPEC_PWR_MODE_3_0A, TYPEC_PWR_MODE_PD, }; enum typec_accessory { TYPEC_ACCESSORY_NONE, TYPEC_ACCESSORY_AUDIO, TYPEC_ACCESSORY_DEBUG, }; #define TYPEC_MAX_ACCESSORY 3 enum typec_orientation { TYPEC_ORIENTATION_NONE, TYPEC_ORIENTATION_NORMAL, TYPEC_ORIENTATION_REVERSE, }; /* * struct enter_usb_data - Enter_USB Message details * @eudo: Enter_USB Data Object * @active_link_training: Active Cable Plug Link Training * * @active_link_training is a flag that should be set with uni-directional SBRX * communication, and left 0 with passive cables and with bi-directional SBRX * communication. */ struct enter_usb_data { u32 eudo; unsigned char active_link_training:1; }; /* * struct usb_pd_identity - USB Power Delivery identity data * @id_header: ID Header VDO * @cert_stat: Cert Stat VDO * @product: Product VDO * @vdo: Product Type Specific VDOs * * USB power delivery Discover Identity command response data. * * REVISIT: This is USB Power Delivery specific information, so this structure * probable belongs to USB Power Delivery header file once we have them. */ struct usb_pd_identity { u32 id_header; u32 cert_stat; u32 product; u32 vdo[3]; }; int typec_partner_set_identity(struct typec_partner *partner); int typec_cable_set_identity(struct typec_cable *cable); /* * struct typec_altmode_desc - USB Type-C Alternate Mode Descriptor * @svid: Standard or Vendor ID * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @roles: Only for ports. DRP if the mode is available in both roles * * Description of an Alternate Mode which a connector, cable plug or partner * supports. */ struct typec_altmode_desc { u16 svid; u8 mode; u32 vdo; /* Only used with ports */ enum typec_port_data roles; }; void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision); int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmodes); struct typec_altmode *typec_partner_register_altmode(struct typec_partner *partner, const struct typec_altmode_desc *desc); int typec_plug_set_num_altmodes(struct typec_plug *plug, int num_altmodes); struct typec_altmode *typec_plug_register_altmode(struct typec_plug *plug, const struct typec_altmode_desc *desc); struct typec_altmode *typec_port_register_altmode(struct typec_port *port, const struct typec_altmode_desc *desc); void typec_port_register_altmodes(struct typec_port *port, const struct typec_altmode_ops *ops, void *drvdata, struct typec_altmode **altmodes, size_t n); void typec_port_register_cable_ops(struct typec_altmode **altmodes, int max_altmodes, const struct typec_cable_ops *ops); void typec_unregister_altmode(struct typec_altmode *altmode); struct typec_port *typec_altmode2port(struct typec_altmode *alt); void typec_altmode_update_active(struct typec_altmode *alt, bool active); void typec_altmode_set_ops(struct typec_altmode *alt, const struct typec_altmode_ops *ops); enum typec_plug_index { TYPEC_PLUG_SOP_P, TYPEC_PLUG_SOP_PP, }; /* * struct typec_plug_desc - USB Type-C Cable Plug Descriptor * @index: SOP Prime for the plug connected to DFP and SOP Double Prime for the * plug connected to UFP * * Represents USB Type-C Cable Plug. */ struct typec_plug_desc { enum typec_plug_index index; }; /* * struct typec_cable_desc - USB Type-C Cable Descriptor * @type: The plug type from USB PD Cable VDO * @active: Is the cable active or passive * @identity: Result of Discover Identity command * @pd_revision: USB Power Delivery Specification revision if supported * * Represents USB Type-C Cable attached to USB Type-C port. */ struct typec_cable_desc { enum typec_plug_type type; unsigned int active:1; struct usb_pd_identity *identity; u16 pd_revision; /* 0300H = "3.0" */ }; /* * struct typec_partner_desc - USB Type-C Partner Descriptor * @usb_pd: USB Power Delivery support * @accessory: Audio, Debug or none. * @identity: Discover Identity command data * @pd_revision: USB Power Delivery Specification Revision if supported * @attach: Notification about attached USB device * @deattach: Notification about removed USB device * * Details about a partner that is attached to USB Type-C port. If @identity * member exists when partner is registered, a directory named "identity" is * created to sysfs for the partner device. * * @pd_revision is based on the setting of the "Specification Revision" field * in the message header on the initial "Source Capabilities" message received * from the partner, or a "Request" message received from the partner, depending * on whether our port is a Sink or a Source. */ struct typec_partner_desc { unsigned int usb_pd:1; enum typec_accessory accessory; struct usb_pd_identity *identity; u16 pd_revision; /* 0300H = "3.0" */ void (*attach)(struct typec_partner *partner, struct device *dev); void (*deattach)(struct typec_partner *partner, struct device *dev); }; /** * struct typec_operations - USB Type-C Port Operations * @try_role: Set data role preference for DRP port * @dr_set: Set Data Role * @pr_set: Set Power Role * @vconn_set: Source VCONN * @port_type_set: Set port type * @pd_get: Get available USB Power Delivery Capabilities. * @pd_set: Set USB Power Delivery Capabilities. */ struct typec_operations { int (*try_role)(struct typec_port *port, int role); int (*dr_set)(struct typec_port *port, enum typec_data_role role); int (*pr_set)(struct typec_port *port, enum typec_role role); int (*vconn_set)(struct typec_port *port, enum typec_role role); int (*port_type_set)(struct typec_port *port, enum typec_port_type type); struct usb_power_delivery **(*pd_get)(struct typec_port *port); int (*pd_set)(struct typec_port *port, struct usb_power_delivery *pd); }; enum usb_pd_svdm_ver { SVDM_VER_1_0 = 0, SVDM_VER_2_0 = 1, SVDM_VER_MAX = SVDM_VER_2_0, }; /* * struct typec_capability - USB Type-C Port Capabilities * @type: Supported power role of the port * @data: Supported data role of the port * @revision: USB Type-C Specification release. Binary coded decimal * @pd_revision: USB Power Delivery Specification revision if supported * @svdm_version: USB PD Structured VDM version if supported * @prefer_role: Initial role preference (DRP ports). * @accessory: Supported Accessory Modes * @fwnode: Optional fwnode of the port * @driver_data: Private pointer for driver specific info * @pd: Optional USB Power Delivery Support * @ops: Port operations vector * * Static capabilities of a single USB Type-C port. */ struct typec_capability { enum typec_port_type type; enum typec_port_data data; u16 revision; /* 0120H = "1.2" */ u16 pd_revision; /* 0300H = "3.0" */ enum usb_pd_svdm_ver svdm_version; int prefer_role; enum typec_accessory accessory[TYPEC_MAX_ACCESSORY]; unsigned int orientation_aware:1; struct fwnode_handle *fwnode; void *driver_data; struct usb_power_delivery *pd; const struct typec_operations *ops; }; /* Specific to try_role(). Indicates the user want's to clear the preference. */ #define TYPEC_NO_PREFERRED_ROLE (-1) struct typec_port *typec_register_port(struct device *parent, const struct typec_capability *cap); void typec_unregister_port(struct typec_port *port); struct typec_partner *typec_register_partner(struct typec_port *port, struct typec_partner_desc *desc); void typec_unregister_partner(struct typec_partner *partner); struct typec_cable *typec_register_cable(struct typec_port *port, struct typec_cable_desc *desc); void typec_unregister_cable(struct typec_cable *cable); struct typec_cable *typec_cable_get(struct typec_port *port); void typec_cable_put(struct typec_cable *cable); int typec_cable_is_active(struct typec_cable *cable); struct typec_plug *typec_register_plug(struct typec_cable *cable, struct typec_plug_desc *desc); void typec_unregister_plug(struct typec_plug *plug); void typec_set_data_role(struct typec_port *port, enum typec_data_role role); void typec_set_pwr_role(struct typec_port *port, enum typec_role role); void typec_set_vconn_role(struct typec_port *port, enum typec_role role); void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode); int typec_set_orientation(struct typec_port *port, enum typec_orientation orientation); enum typec_orientation typec_get_orientation(struct typec_port *port); int typec_set_mode(struct typec_port *port, int mode); void *typec_get_drvdata(struct typec_port *port); int typec_get_fw_cap(struct typec_capability *cap, struct fwnode_handle *fwnode); int typec_find_pwr_opmode(const char *name); int typec_find_orientation(const char *name); int typec_find_port_power_role(const char *name); int typec_find_power_role(const char *name); int typec_find_port_data_role(const char *name); void typec_partner_set_svdm_version(struct typec_partner *partner, enum usb_pd_svdm_ver svdm_version); int typec_get_negotiated_svdm_version(struct typec_port *port); int typec_get_cable_svdm_version(struct typec_port *port); void typec_cable_set_svdm_version(struct typec_cable *cable, enum usb_pd_svdm_ver svdm_version); struct usb_power_delivery *typec_partner_usb_power_delivery_register(struct typec_partner *partner, struct usb_power_delivery_desc *desc); int typec_port_set_usb_power_delivery(struct typec_port *port, struct usb_power_delivery *pd); int typec_partner_set_usb_power_delivery(struct typec_partner *partner, struct usb_power_delivery *pd); /** * struct typec_connector - Representation of Type-C port for external drivers * @attach: notification about device removal * @deattach: notification about device removal * * Drivers that control the USB and other ports (DisplayPorts, etc.), that are * connected to the Type-C connectors, can use these callbacks to inform the * Type-C connector class about connections and disconnections. That information * can then be used by the typec-port drivers to power on or off parts that are * needed or not needed - as an example, in USB mode if USB2 device is * enumerated, USB3 components (retimers, phys, and what have you) do not need * to be powered on. * * The attached (enumerated) devices will be liked with the typec-partner device. */ struct typec_connector { void (*attach)(struct typec_connector *con, struct device *dev); void (*deattach)(struct typec_connector *con, struct device *dev); }; static inline void typec_attach(struct typec_connector *con, struct device *dev) { if (con && con->attach) con->attach(con, dev); } static inline void typec_deattach(struct typec_connector *con, struct device *dev) { if (con && con->deattach) con->deattach(con, dev); } #endif /* __LINUX_USB_TYPEC_H */
14 6 18 14 14 14 14 14 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 // SPDX-License-Identifier: GPL-2.0-or-later /* * geniv: Shared IV generator code * * This file provides common code to IV generators such as seqiv. * * Copyright (c) 2007-2019 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/geniv.h> #include <crypto/internal/rng.h> #include <crypto/null.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> static int aead_geniv_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(tfm); return crypto_aead_setkey(ctx->child, key, keylen); } static int aead_geniv_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(tfm); return crypto_aead_setauthsize(ctx->child, authsize); } static void aead_geniv_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } struct aead_instance *aead_geniv_alloc(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_aead_spawn *spawn; struct aead_instance *inst; struct aead_alg *alg; unsigned int ivsize; unsigned int maxauthsize; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); ivsize = crypto_aead_alg_ivsize(alg); maxauthsize = crypto_aead_alg_maxauthsize(alg); err = -EINVAL; if (ivsize < sizeof(u64)) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", tmpl->name, alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", tmpl->name, alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = alg->base.cra_blocksize; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct aead_geniv_ctx); inst->alg.setkey = aead_geniv_setkey; inst->alg.setauthsize = aead_geniv_setauthsize; inst->alg.ivsize = ivsize; inst->alg.maxauthsize = maxauthsize; inst->free = aead_geniv_free; out: return inst; err_free_inst: aead_geniv_free(inst); inst = ERR_PTR(err); goto out; } EXPORT_SYMBOL_GPL(aead_geniv_alloc); int aead_init_geniv(struct crypto_aead *aead) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(aead); struct aead_instance *inst = aead_alg_instance(aead); struct crypto_aead *child; int err; spin_lock_init(&ctx->lock); err = crypto_get_default_rng(); if (err) goto out; err = crypto_rng_get_bytes(crypto_default_rng, ctx->salt, crypto_aead_ivsize(aead)); crypto_put_default_rng(); if (err) goto out; ctx->sknull = crypto_get_default_null_skcipher(); err = PTR_ERR(ctx->sknull); if (IS_ERR(ctx->sknull)) goto out; child = crypto_spawn_aead(aead_instance_ctx(inst)); err = PTR_ERR(child); if (IS_ERR(child)) goto drop_null; ctx->child = child; crypto_aead_set_reqsize(aead, crypto_aead_reqsize(child) + sizeof(struct aead_request)); err = 0; out: return err; drop_null: crypto_put_default_null_skcipher(); goto out; } EXPORT_SYMBOL_GPL(aead_init_geniv); void aead_exit_geniv(struct crypto_aead *tfm) { struct aead_geniv_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); crypto_put_default_null_skcipher(); } EXPORT_SYMBOL_GPL(aead_exit_geniv); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Shared IV generator code");
5 5 16 13 15 15 14 9 6 15 1409 1 7 7 1406 15 16 15 15 13 14 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; struct crypto_shash *siw_crypto_shash; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; sdev->netdev = netdev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_UP: sdev->state = IB_PORT_ACTIVE; siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); break; case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: case NETDEV_CHANGE: break; default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); if (netif_running(netdev) && netif_carrier_ok(netdev)) sdev->state = IB_PORT_ACTIVE; else sdev->state = IB_PORT_DOWN; ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } /* * Locate CRC32 algorithm. If unsuccessful, fail * loading siw only, if CRC is required. */ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(siw_crypto_shash)) { pr_info("siw: Loading CRC32c failed: %ld\n", PTR_ERR(siw_crypto_shash)); siw_crypto_shash = NULL; if (mpa_crc_required) { rv = -EOPNOTSUPP; goto out_error; } } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); if (siw_crypto_shash) crypto_free_shash(siw_crypto_shash); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
4 9 1 7 1509 1510 124 1268 102 3 5 5 581 580 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 /* SPDX-License-Identifier: GPL-2.0 */ /* * Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #ifndef __LINUX_NEXTHOP_H #define __LINUX_NEXTHOP_H #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/route.h> #include <linux/types.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/netlink.h> #define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK struct nexthop; struct nh_config { u32 nh_id; u8 nh_family; u8 nh_protocol; u8 nh_blackhole; u8 nh_fdb; u32 nh_flags; int nh_ifindex; struct net_device *dev; union { __be32 ipv4; struct in6_addr ipv6; } gw; struct nlattr *nh_grp; u16 nh_grp_type; u16 nh_grp_res_num_buckets; unsigned long nh_grp_res_idle_timer; unsigned long nh_grp_res_unbalanced_timer; bool nh_grp_res_has_num_buckets; bool nh_grp_res_has_idle_timer; bool nh_grp_res_has_unbalanced_timer; bool nh_hw_stats; struct nlattr *nh_encap; u16 nh_encap_type; u32 nlflags; struct nl_info nlinfo; }; struct nh_info { struct hlist_node dev_hash; /* entry on netns devhash */ struct nexthop *nh_parent; u8 family; bool reject_nh; bool fdb_nh; union { struct fib_nh_common fib_nhc; struct fib_nh fib_nh; struct fib6_nh fib6_nh; }; }; struct nh_res_bucket { struct nh_grp_entry __rcu *nh_entry; atomic_long_t used_time; unsigned long migrated_time; bool occupied; u8 nh_flags; }; struct nh_res_table { struct net *net; u32 nhg_id; struct delayed_work upkeep_dw; /* List of NHGEs that have too few buckets ("uw" for underweight). * Reclaimed buckets will be given to entries in this list. */ struct list_head uw_nh_entries; unsigned long unbalanced_since; u32 idle_timer; u32 unbalanced_timer; u16 num_nh_buckets; struct nh_res_bucket nh_buckets[] __counted_by(num_nh_buckets); }; struct nh_grp_entry_stats { u64_stats_t packets; struct u64_stats_sync syncp; }; struct nh_grp_entry { struct nexthop *nh; struct nh_grp_entry_stats __percpu *stats; u16 weight; union { struct { atomic_t upper_bound; } hthr; struct { /* Member on uw_nh_entries. */ struct list_head uw_nh_entry; u16 count_buckets; u16 wants_buckets; } res; }; struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ u64 packets_hw; }; struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool is_multipath; bool hash_threshold; bool resilient; bool fdb_nh; bool has_v4; bool hw_stats; struct nh_res_table __rcu *res_table; struct nh_grp_entry nh_entries[] __counted_by(num_nh); }; struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; u32 id; u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; refcount_t refcnt; struct rcu_head rcu; union { struct nh_info __rcu *nh_info; struct nh_group __rcu *nh_grp; }; }; enum nexthop_event_type { NEXTHOP_EVENT_DEL, NEXTHOP_EVENT_REPLACE, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, NEXTHOP_EVENT_BUCKET_REPLACE, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, }; enum nh_notifier_info_type { NH_NOTIFIER_INFO_TYPE_SINGLE, NH_NOTIFIER_INFO_TYPE_GRP, NH_NOTIFIER_INFO_TYPE_RES_TABLE, NH_NOTIFIER_INFO_TYPE_RES_BUCKET, NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS, }; struct nh_notifier_single_info { struct net_device *dev; u8 gw_family; union { __be32 ipv4; struct in6_addr ipv6; }; u32 id; u8 is_reject:1, is_fdb:1, has_encap:1; }; struct nh_notifier_grp_entry_info { u16 weight; struct nh_notifier_single_info nh; }; struct nh_notifier_grp_info { u16 num_nh; bool is_fdb; bool hw_stats; struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh); }; struct nh_notifier_res_bucket_info { u16 bucket_index; unsigned int idle_timer_ms; bool force; struct nh_notifier_single_info old_nh; struct nh_notifier_single_info new_nh; }; struct nh_notifier_res_table_info { u16 num_nh_buckets; bool hw_stats; struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets); }; struct nh_notifier_grp_hw_stats_entry_info { u32 id; u64 packets; }; struct nh_notifier_grp_hw_stats_info { u16 num_nh; bool hw_stats_used; struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh); }; struct nh_notifier_info { struct net *net; struct netlink_ext_ack *extack; u32 id; enum nh_notifier_info_type type; union { struct nh_notifier_single_info *nh; struct nh_notifier_grp_info *nh_grp; struct nh_notifier_res_table_info *nh_res_table; struct nh_notifier_res_bucket_info *nh_res_bucket; struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats; }; }; int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity); void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 delta_packets); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); static inline bool nexthop_get(struct nexthop *nh) { return refcount_inc_not_zero(&nh->refcnt); } static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) call_rcu_hurry(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, const struct nexthop *nh2) { return nh1 == nh2; } static inline bool nexthop_is_fdb(const struct nexthop *nh) { if (nh->is_group) { const struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->fdb_nh; } else { const struct nh_info *nhi; nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->fdb_nh; } } static inline bool nexthop_has_v4(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->has_v4; } return false; } static inline bool nexthop_is_multipath(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->is_multipath; } return false; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash); static inline unsigned int nexthop_num_path(const struct nexthop *nh) { unsigned int rc = 1; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->is_multipath) rc = nh_grp->num_nh; } return rc; } static inline struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel) { /* for_nexthops macros in fib_semantics.c grabs a pointer to * the nexthop before checking nhsel */ if (nhsel >= nhg->num_nh) return NULL; return nhg->nh_entries[nhsel].nh; } static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; struct nh_info *nhi = rcu_dereference_rtnl(nhe->nh_info); struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } return 0; } /* called with rcu lock */ static inline bool nexthop_is_blackhole(const struct nexthop *nh) { const struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->num_nh > 1) return false; nh = nh_grp->nh_entries[0].nh; } nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->reject_nh; } static inline void nexthop_path_fib_result(struct fib_result *res, int hash) { struct nh_info *nhi; struct nexthop *nh; nh = nexthop_select_path(res->fi->nh, hash); nhi = rcu_dereference(nh->nh_info); res->nhc = &nhi->fib_nhc; } /* called with rcu read lock or rtnl held */ static inline struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) { struct nh_info *nhi; BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0); if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->is_multipath) { nh = nexthop_mpath_select(nh_grp, nhsel); if (!nh) return NULL; } } nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } /* called from fib_table_lookup with rcu_lock */ static inline struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh, int fib_flags, const struct flowi4 *flp, int *nhsel) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = i; return &nhi->fib_nhc; } } } else { nhi = rcu_dereference(nh->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = 0; return &nhi->fib_nhc; } } return NULL; } static inline bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } } else { nhi = rcu_dereference(nh->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } return false; } static inline unsigned int fib_info_num_path(const struct fib_info *fi) { if (unlikely(fi->nh)) return nexthop_num_path(fi->nh); return fi->fib_nhs; } int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack); static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel) { if (unlikely(fi->nh)) return nexthop_fib_nhc(fi->nh, nhsel); return &fi->fib_nh[nhsel].nh_common; } /* only used when fib_nh is built into fib_info */ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) { WARN_ON(fi->nh); return &fi->fib_nh[nhsel]; } /* * IPv6 variants */ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); /* Caller should either hold rcu_read_lock(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET6) return &nhi->fib6_nh; return NULL; } static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i) { struct fib6_nh *fib6_nh; fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh; return fib6_nh->fib_nh_dev; } static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) { struct nexthop *nh = res->f6i->nh; struct nh_info *nhi; nh = nexthop_select_path(nh, hash); nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->reject_nh) { res->fib6_type = RTN_BLACKHOLE; res->fib6_flags |= RTF_REJECT; res->nh = nexthop_fib6_nh(nh); } else { res->nh = &nhi->fib6_nh; } } int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); static inline int nexthop_get_family(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->family; } static inline struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, int hash) { struct nh_info *nhi; struct nexthop *nhp; nhp = nexthop_select_path(nh, hash); if (unlikely(!nhp)) return NULL; nhi = rcu_dereference(nhp->nh_info); return &nhi->fib_nhc; } #endif
53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM icmp #if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ICMP_H #include <linux/icmp.h> #include <linux/tracepoint.h> TRACE_EVENT(icmp_send, TP_PROTO(const struct sk_buff *skb, int type, int code), TP_ARGS(skb, type, code), TP_STRUCT__entry( __field(const void *, skbaddr) __field(int, type) __field(int, code) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __field(__u16, sport) __field(__u16, dport) __field(unsigned short, ulen) ), TP_fast_assign( struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = udp_hdr(skb); int proto_4 = iph->protocol; __be32 *p32; __entry->skbaddr = skb; __entry->type = type; __entry->code = code; if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head || (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) { __entry->sport = 0; __entry->dport = 0; __entry->ulen = 0; } else { __entry->sport = ntohs(uh->source); __entry->dport = ntohs(uh->dest); __entry->ulen = ntohs(uh->len); } p32 = (__be32 *) __entry->saddr; *p32 = iph->saddr; p32 = (__be32 *) __entry->daddr; *p32 = iph->daddr; ), TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u ulen=%d skbaddr=%p", __entry->type, __entry->code, __entry->saddr, __entry->sport, __entry->daddr, __entry->dport, __entry->ulen, __entry->skbaddr) ); #endif /* _TRACE_ICMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
13 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * symlink.h * * Function prototypes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef OCFS2_SYMLINK_H #define OCFS2_SYMLINK_H extern const struct inode_operations ocfs2_symlink_inode_operations; extern const struct address_space_operations ocfs2_fast_symlink_aops; /* * Test whether an inode is a fast symlink. */ static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) { return (S_ISLNK(inode->i_mode) && inode->i_blocks == 0); } #endif /* OCFS2_SYMLINK_H */
7 7 6 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0 */ /* * DES & Triple DES EDE key verification helpers */ #ifndef __CRYPTO_INTERNAL_DES_H #define __CRYPTO_INTERNAL_DES_H #include <linux/crypto.h> #include <linux/fips.h> #include <crypto/des.h> #include <crypto/aead.h> #include <crypto/skcipher.h> /** * crypto_des_verify_key - Check whether a DES key is weak * @tfm: the crypto algo * @key: the key buffer * * Returns -EINVAL if the key is weak and the crypto TFM does not permit weak * keys. Otherwise, 0 is returned. * * It is the job of the caller to ensure that the size of the key equals * DES_KEY_SIZE. */ static inline int crypto_des_verify_key(struct crypto_tfm *tfm, const u8 *key) { struct des_ctx tmp; int err; err = des_expand_key(&tmp, key, DES_KEY_SIZE); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } memzero_explicit(&tmp, sizeof(tmp)); return err; } /* * RFC2451: * * For DES-EDE3, there is no known need to reject weak or * complementation keys. Any weakness is obviated by the use of * multiple keys. * * However, if the first two or last two independent 64-bit keys are * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the * same as DES. Implementers MUST reject keys that exhibit this * property. * */ static inline int des3_ede_verify_key(const u8 *key, unsigned int key_len, bool check_weak) { int ret = fips_enabled ? -EINVAL : -ENOKEY; u32 K[6]; memcpy(K, key, DES3_EDE_KEY_SIZE); if ((!((K[0] ^ K[2]) | (K[1] ^ K[3])) || !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && (fips_enabled || check_weak)) goto bad; if ((!((K[0] ^ K[4]) | (K[1] ^ K[5]))) && fips_enabled) goto bad; ret = 0; bad: memzero_explicit(K, DES3_EDE_KEY_SIZE); return ret; } /** * crypto_des3_ede_verify_key - Check whether a DES3-EDE key is weak * @tfm: the crypto algo * @key: the key buffer * * Returns -EINVAL if the key is weak and the crypto TFM does not permit weak * keys or when running in FIPS mode. Otherwise, 0 is returned. Note that some * keys are rejected in FIPS mode even if weak keys are permitted by the TFM * flags. * * It is the job of the caller to ensure that the size of the key equals * DES3_EDE_KEY_SIZE. */ static inline int crypto_des3_ede_verify_key(struct crypto_tfm *tfm, const u8 *key) { return des3_ede_verify_key(key, DES3_EDE_KEY_SIZE, crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); } static inline int verify_skcipher_des_key(struct crypto_skcipher *tfm, const u8 *key) { return crypto_des_verify_key(crypto_skcipher_tfm(tfm), key); } static inline int verify_skcipher_des3_key(struct crypto_skcipher *tfm, const u8 *key) { return crypto_des3_ede_verify_key(crypto_skcipher_tfm(tfm), key); } static inline int verify_aead_des_key(struct crypto_aead *tfm, const u8 *key, int keylen) { if (keylen != DES_KEY_SIZE) return -EINVAL; return crypto_des_verify_key(crypto_aead_tfm(tfm), key); } static inline int verify_aead_des3_key(struct crypto_aead *tfm, const u8 *key, int keylen) { if (keylen != DES3_EDE_KEY_SIZE) return -EINVAL; return crypto_des3_ede_verify_key(crypto_aead_tfm(tfm), key); } #endif /* __CRYPTO_INTERNAL_DES_H */
967 951 115 118 116 116 116 8 8 2 6 116 116 116 116 116 116 116 118 117 118 117 117 118 118 116 118 118 19 19 118 19 118 15 16 16 2 16 115 116 116 116 115 116 116 116 16 16 16 15 15 15 15 15 15 8 15 4 15 15 16 15 115 116 116 116 116 116 116 115 116 135 134 113 25 134 118 119 134 32 32 32 759 760 758 7 1 7 7 3 3 3 2 115 115 114 115 115 114 115 112 115 115 1 114 20 115 131 118 116 26 118 26 26 5 26 3 26 118 130 20 11 20 20 115 115 1 115 115 114 114 114 1 113 115 3 3 3 2 2 2 2 2 2 1 3 3 3 115 19 19 19 19 19 10 19 19 19 115 109 108 106 108 107 108 36 74 74 18 59 115 12 115 115 36 12 12 1 1 1 11 115 115 115 115 112 112 112 140 8 132 1 131 29 118 113 7 117 115 116 115 115 115 110 111 114 115 127 137 78 80 73 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * Check do_open_execat() for an explanation. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) goto exit; error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * by hand ahead of time. */ if (write && pos < vma->vm_start) { mmap_write_lock(mm); ret = expand_downwards(vma, pos); if (unlikely(ret < 0)) { mmap_write_unlock(mm); return NULL; } mmap_write_downgrade(mm); } else mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Need to be called with mmap write lock * held, to avoid race with ksmd. */ err = ksm_execve(mm); if (err) goto err_ksm; /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: ksm_exit(mm); err_ksm: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static inline int bprm_set_stack_limit(struct linux_binprm *bprm, unsigned long limit) { #ifdef CONFIG_MMU /* Avoid a pathological bprm->p. */ if (bprm->p < limit) return -E2BIG; bprm->argmin = bprm->p - limit; #endif return 0; } static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) { #ifdef CONFIG_MMU return bprm->p < bprm->argmin; #else return false; #endif } /* * Calculate bprm->argmin from: * - _STK_LIM * - ARG_MAX * - bprm->rlim_stack.rlim_cur * - bprm->argc * - bprm->envc * - bprm->p */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; return bprm_set_stack_limit(bprm, limit); } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out; while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. */ ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ /* * On success, caller must call do_close_execat() on the returned * struct file to close it. */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) { fput(file); return ERR_PTR(-EACCES); } return file; } /** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(). Also see * do_close_execat(). */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); /* Always NUL terminated and zero-padded */ strscpy_pad(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec. */ trace_sched_prepare_exec(current, bprm); /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { if (file) fput(file); } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file); bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { do_close_execat(file); return ERR_PTR(-ENOMEM); } bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!bprm->fdpath) goto out_free; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (!retval) return bprm; out_free: free_bprm(bprm); return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; return 0; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } static int bprm_execve(struct linux_binprm *bprm) { int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); sched_exec(); /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; } retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) validate_coredump_safety(); return error; } static struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_EXEC_KUNIT_TEST #include "tests/exec_kunit.c" #endif
10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 // SPDX-License-Identifier: GPL-2.0 /* * fs-verity module initialization and logging * * Copyright 2019 Google LLC */ #include "fsverity_private.h" #include <linux/ratelimit.h> #ifdef CONFIG_SYSCTL static struct ctl_table fsverity_sysctl_table[] = { #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES { .procname = "require_signatures", .data = &fsverity_require_signatures, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif }; static void __init fsverity_init_sysctl(void) { register_sysctl_init("fs/verity", fsverity_sysctl_table); } #else /* CONFIG_SYSCTL */ static inline void fsverity_init_sysctl(void) { } #endif /* !CONFIG_SYSCTL */ void fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct va_format vaf; va_list args; if (!__ratelimit(&rs)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (inode) printk("%sfs-verity (%s, inode %lu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else printk("%sfs-verity: %pV\n", level, &vaf); va_end(args); } static int __init fsverity_init(void) { fsverity_check_hash_algs(); fsverity_init_info_cache(); fsverity_init_workqueue(); fsverity_init_sysctl(); fsverity_init_signature(); fsverity_init_bpf(); return 0; } late_initcall(fsverity_init)
4 4 4 78 78 81 81 29 9 9 9 1 6 4 4 3 2 1 1 1 1 49 39 48 48 48 24 80 79 81 81 81 159 29 6 6 6 6 6 5 3 1 48 48 48 48 48 71 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 // SPDX-License-Identifier: GPL-2.0-or-later /* * Software async crypto daemon. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> * * Added AEAD support to cryptd. * Authors: Tadeusz Struk (tadeusz.struk@intel.com) * Adrian Hoban <adrian.hoban@intel.com> * Gabriele Paoloni <gabriele.paoloni@intel.com> * Aidan O'Mahony (aidan.o.mahony@intel.com) * Copyright (c) 2010, Intel Corporation. */ #include <crypto/internal/hash.h> #include <crypto/internal/aead.h> #include <crypto/internal/skcipher.h> #include <crypto/cryptd.h> #include <linux/refcount.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/workqueue.h> static unsigned int cryptd_max_cpu_qlen = 1000; module_param(cryptd_max_cpu_qlen, uint, 0); MODULE_PARM_DESC(cryptd_max_cpu_qlen, "Set cryptd Max queue depth"); static struct workqueue_struct *cryptd_wq; struct cryptd_cpu_queue { struct crypto_queue queue; struct work_struct work; }; struct cryptd_queue { /* * Protected by disabling BH to allow enqueueing from softinterrupt and * dequeuing from kworker (cryptd_queue_worker()). */ struct cryptd_cpu_queue __percpu *cpu_queue; }; struct cryptd_instance_ctx { struct crypto_spawn spawn; struct cryptd_queue *queue; }; struct skcipherd_instance_ctx { struct crypto_skcipher_spawn spawn; struct cryptd_queue *queue; }; struct hashd_instance_ctx { struct crypto_shash_spawn spawn; struct cryptd_queue *queue; }; struct aead_instance_ctx { struct crypto_aead_spawn aead_spawn; struct cryptd_queue *queue; }; struct cryptd_skcipher_ctx { refcount_t refcnt; struct crypto_skcipher *child; }; struct cryptd_skcipher_request_ctx { struct skcipher_request req; }; struct cryptd_hash_ctx { refcount_t refcnt; struct crypto_shash *child; }; struct cryptd_hash_request_ctx { crypto_completion_t complete; void *data; struct shash_desc desc; }; struct cryptd_aead_ctx { refcount_t refcnt; struct crypto_aead *child; }; struct cryptd_aead_request_ctx { struct aead_request req; }; static void cryptd_queue_worker(struct work_struct *work); static int cryptd_init_queue(struct cryptd_queue *queue, unsigned int max_cpu_qlen) { int cpu; struct cryptd_cpu_queue *cpu_queue; queue->cpu_queue = alloc_percpu(struct cryptd_cpu_queue); if (!queue->cpu_queue) return -ENOMEM; for_each_possible_cpu(cpu) { cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); INIT_WORK(&cpu_queue->work, cryptd_queue_worker); } pr_info("cryptd: max_cpu_qlen set to %d\n", max_cpu_qlen); return 0; } static void cryptd_fini_queue(struct cryptd_queue *queue) { int cpu; struct cryptd_cpu_queue *cpu_queue; for_each_possible_cpu(cpu) { cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); BUG_ON(cpu_queue->queue.qlen); } free_percpu(queue->cpu_queue); } static int cryptd_enqueue_request(struct cryptd_queue *queue, struct crypto_async_request *request) { int err; struct cryptd_cpu_queue *cpu_queue; refcount_t *refcnt; local_bh_disable(); cpu_queue = this_cpu_ptr(queue->cpu_queue); err = crypto_enqueue_request(&cpu_queue->queue, request); refcnt = crypto_tfm_ctx(request->tfm); if (err == -ENOSPC) goto out; queue_work_on(smp_processor_id(), cryptd_wq, &cpu_queue->work); if (!refcount_read(refcnt)) goto out; refcount_inc(refcnt); out: local_bh_enable(); return err; } /* Called in workqueue context, do one real cryption work (via * req->complete) and reschedule itself if there are more work to * do. */ static void cryptd_queue_worker(struct work_struct *work) { struct cryptd_cpu_queue *cpu_queue; struct crypto_async_request *req, *backlog; cpu_queue = container_of(work, struct cryptd_cpu_queue, work); /* * Only handle one request at a time to avoid hogging crypto workqueue. */ local_bh_disable(); backlog = crypto_get_backlog(&cpu_queue->queue); req = crypto_dequeue_request(&cpu_queue->queue); local_bh_enable(); if (!req) return; if (backlog) crypto_request_complete(backlog, -EINPROGRESS); crypto_request_complete(req, 0); if (cpu_queue->queue.qlen) queue_work(cryptd_wq, &cpu_queue->work); } static inline struct cryptd_queue *cryptd_get_queue(struct crypto_tfm *tfm) { struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); struct cryptd_instance_ctx *ictx = crypto_instance_ctx(inst); return ictx->queue; } static void cryptd_type_and_mask(struct crypto_attr_type *algt, u32 *type, u32 *mask) { /* * cryptd is allowed to wrap internal algorithms, but in that case the * resulting cryptd instance will be marked as internal as well. */ *type = algt->type & CRYPTO_ALG_INTERNAL; *mask = algt->mask & CRYPTO_ALG_INTERNAL; /* No point in cryptd wrapping an algorithm that's already async. */ *mask |= CRYPTO_ALG_ASYNC; *mask |= crypto_algt_inherited_mask(algt); } static int cryptd_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "cryptd(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); inst->alg.cra_priority = alg->cra_priority + 50; inst->alg.cra_blocksize = alg->cra_blocksize; inst->alg.cra_alignmask = alg->cra_alignmask; return 0; } static int cryptd_skcipher_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(child, key, keylen); } static struct skcipher_request *cryptd_skcipher_prepare( struct skcipher_request *req, int err) { struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct skcipher_request *subreq = &rctx->req; struct cryptd_skcipher_ctx *ctx; struct crypto_skcipher *child; req->base.complete = subreq->base.complete; req->base.data = subreq->base.data; if (unlikely(err == -EINPROGRESS)) return NULL; ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); child = ctx->child; skcipher_request_set_tfm(subreq, child); skcipher_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, req->iv); return subreq; } static void cryptd_skcipher_complete(struct skcipher_request *req, int err, crypto_completion_t complete) { struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_request *subreq = &rctx->req; int refcnt = refcount_read(&ctx->refcnt); local_bh_disable(); skcipher_request_complete(req, err); local_bh_enable(); if (unlikely(err == -EINPROGRESS)) { subreq->base.complete = req->base.complete; subreq->base.data = req->base.data; req->base.complete = complete; req->base.data = req; } else if (refcnt && refcount_dec_and_test(&ctx->refcnt)) crypto_free_skcipher(tfm); } static void cryptd_skcipher_encrypt(void *data, int err) { struct skcipher_request *req = data; struct skcipher_request *subreq; subreq = cryptd_skcipher_prepare(req, err); if (likely(subreq)) err = crypto_skcipher_encrypt(subreq); cryptd_skcipher_complete(req, err, cryptd_skcipher_encrypt); } static void cryptd_skcipher_decrypt(void *data, int err) { struct skcipher_request *req = data; struct skcipher_request *subreq; subreq = cryptd_skcipher_prepare(req, err); if (likely(subreq)) err = crypto_skcipher_decrypt(subreq); cryptd_skcipher_complete(req, err, cryptd_skcipher_decrypt); } static int cryptd_skcipher_enqueue(struct skcipher_request *req, crypto_completion_t compl) { struct cryptd_skcipher_request_ctx *rctx = skcipher_request_ctx(req); struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_request *subreq = &rctx->req; struct cryptd_queue *queue; queue = cryptd_get_queue(crypto_skcipher_tfm(tfm)); subreq->base.complete = req->base.complete; subreq->base.data = req->base.data; req->base.complete = compl; req->base.data = req; return cryptd_enqueue_request(queue, &req->base); } static int cryptd_skcipher_encrypt_enqueue(struct skcipher_request *req) { return cryptd_skcipher_enqueue(req, cryptd_skcipher_encrypt); } static int cryptd_skcipher_decrypt_enqueue(struct skcipher_request *req) { return cryptd_skcipher_enqueue(req, cryptd_skcipher_decrypt); } static int cryptd_skcipher_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct skcipherd_instance_ctx *ictx = skcipher_instance_ctx(inst); struct crypto_skcipher_spawn *spawn = &ictx->spawn; struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *cipher; cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_skcipher_set_reqsize( tfm, sizeof(struct cryptd_skcipher_request_ctx) + crypto_skcipher_reqsize(cipher)); return 0; } static void cryptd_skcipher_exit_tfm(struct crypto_skcipher *tfm) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); crypto_free_skcipher(ctx->child); } static void cryptd_skcipher_free(struct skcipher_instance *inst) { struct skcipherd_instance_ctx *ctx = skcipher_instance_ctx(inst); crypto_drop_skcipher(&ctx->spawn); kfree(inst); } static int cryptd_create_skcipher(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt, struct cryptd_queue *queue) { struct skcipherd_instance_ctx *ctx; struct skcipher_instance *inst; struct skcipher_alg_common *alg; u32 type; u32 mask; int err; cryptd_type_and_mask(algt, &type, &mask); inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = skcipher_instance_ctx(inst); ctx->queue = queue; err = crypto_grab_skcipher(&ctx->spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), type, mask); if (err) goto err_free_inst; alg = crypto_spawn_skcipher_alg_common(&ctx->spawn); err = cryptd_init_instance(skcipher_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC | (alg->base.cra_flags & CRYPTO_ALG_INTERNAL); inst->alg.ivsize = alg->ivsize; inst->alg.chunksize = alg->chunksize; inst->alg.min_keysize = alg->min_keysize; inst->alg.max_keysize = alg->max_keysize; inst->alg.base.cra_ctxsize = sizeof(struct cryptd_skcipher_ctx); inst->alg.init = cryptd_skcipher_init_tfm; inst->alg.exit = cryptd_skcipher_exit_tfm; inst->alg.setkey = cryptd_skcipher_setkey; inst->alg.encrypt = cryptd_skcipher_encrypt_enqueue; inst->alg.decrypt = cryptd_skcipher_decrypt_enqueue; inst->free = cryptd_skcipher_free; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: cryptd_skcipher_free(inst); } return err; } static int cryptd_hash_init_tfm(struct crypto_ahash *tfm) { struct ahash_instance *inst = ahash_alg_instance(tfm); struct hashd_instance_ctx *ictx = ahash_instance_ctx(inst); struct crypto_shash_spawn *spawn = &ictx->spawn; struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_shash *hash; hash = crypto_spawn_shash(spawn); if (IS_ERR(hash)) return PTR_ERR(hash); ctx->child = hash; crypto_ahash_set_reqsize(tfm, sizeof(struct cryptd_hash_request_ctx) + crypto_shash_descsize(hash)); return 0; } static int cryptd_hash_clone_tfm(struct crypto_ahash *ntfm, struct crypto_ahash *tfm) { struct cryptd_hash_ctx *nctx = crypto_ahash_ctx(ntfm); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_shash *hash; hash = crypto_clone_shash(ctx->child); if (IS_ERR(hash)) return PTR_ERR(hash); nctx->child = hash; return 0; } static void cryptd_hash_exit_tfm(struct crypto_ahash *tfm) { struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); crypto_free_shash(ctx->child); } static int cryptd_hash_setkey(struct crypto_ahash *parent, const u8 *key, unsigned int keylen) { struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(parent); struct crypto_shash *child = ctx->child; crypto_shash_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_shash_set_flags(child, crypto_ahash_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_shash_setkey(child, key, keylen); } static int cryptd_hash_enqueue(struct ahash_request *req, crypto_completion_t compl) { struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_queue *queue = cryptd_get_queue(crypto_ahash_tfm(tfm)); rctx->complete = req->base.complete; rctx->data = req->base.data; req->base.complete = compl; req->base.data = req; return cryptd_enqueue_request(queue, &req->base); } static struct shash_desc *cryptd_hash_prepare(struct ahash_request *req, int err) { struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); req->base.complete = rctx->complete; req->base.data = rctx->data; if (unlikely(err == -EINPROGRESS)) return NULL; return &rctx->desc; } static void cryptd_hash_complete(struct ahash_request *req, int err, crypto_completion_t complete) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); int refcnt = refcount_read(&ctx->refcnt); local_bh_disable(); ahash_request_complete(req, err); local_bh_enable(); if (err == -EINPROGRESS) { req->base.complete = complete; req->base.data = req; } else if (refcnt && refcount_dec_and_test(&ctx->refcnt)) crypto_free_ahash(tfm); } static void cryptd_hash_init(void *data, int err) { struct ahash_request *req = data; struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_shash *child = ctx->child; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); if (unlikely(!desc)) goto out; desc->tfm = child; err = crypto_shash_init(desc); out: cryptd_hash_complete(req, err, cryptd_hash_init); } static int cryptd_hash_init_enqueue(struct ahash_request *req) { return cryptd_hash_enqueue(req, cryptd_hash_init); } static void cryptd_hash_update(void *data, int err) { struct ahash_request *req = data; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); if (likely(desc)) err = shash_ahash_update(req, desc); cryptd_hash_complete(req, err, cryptd_hash_update); } static int cryptd_hash_update_enqueue(struct ahash_request *req) { return cryptd_hash_enqueue(req, cryptd_hash_update); } static void cryptd_hash_final(void *data, int err) { struct ahash_request *req = data; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); if (likely(desc)) err = crypto_shash_final(desc, req->result); cryptd_hash_complete(req, err, cryptd_hash_final); } static int cryptd_hash_final_enqueue(struct ahash_request *req) { return cryptd_hash_enqueue(req, cryptd_hash_final); } static void cryptd_hash_finup(void *data, int err) { struct ahash_request *req = data; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); if (likely(desc)) err = shash_ahash_finup(req, desc); cryptd_hash_complete(req, err, cryptd_hash_finup); } static int cryptd_hash_finup_enqueue(struct ahash_request *req) { return cryptd_hash_enqueue(req, cryptd_hash_finup); } static void cryptd_hash_digest(void *data, int err) { struct ahash_request *req = data; struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct crypto_shash *child = ctx->child; struct shash_desc *desc; desc = cryptd_hash_prepare(req, err); if (unlikely(!desc)) goto out; desc->tfm = child; err = shash_ahash_digest(req, desc); out: cryptd_hash_complete(req, err, cryptd_hash_digest); } static int cryptd_hash_digest_enqueue(struct ahash_request *req) { return cryptd_hash_enqueue(req, cryptd_hash_digest); } static int cryptd_hash_export(struct ahash_request *req, void *out) { struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); return crypto_shash_export(&rctx->desc, out); } static int cryptd_hash_import(struct ahash_request *req, const void *in) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm); struct shash_desc *desc = cryptd_shash_desc(req); desc->tfm = ctx->child; return crypto_shash_import(desc, in); } static void cryptd_hash_free(struct ahash_instance *inst) { struct hashd_instance_ctx *ctx = ahash_instance_ctx(inst); crypto_drop_shash(&ctx->spawn); kfree(inst); } static int cryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt, struct cryptd_queue *queue) { struct hashd_instance_ctx *ctx; struct ahash_instance *inst; struct shash_alg *alg; u32 type; u32 mask; int err; cryptd_type_and_mask(algt, &type, &mask); inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = ahash_instance_ctx(inst); ctx->queue = queue; err = crypto_grab_shash(&ctx->spawn, ahash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), type, mask); if (err) goto err_free_inst; alg = crypto_spawn_shash_alg(&ctx->spawn); err = cryptd_init_instance(ahash_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.halg.base.cra_flags |= CRYPTO_ALG_ASYNC | (alg->base.cra_flags & (CRYPTO_ALG_INTERNAL| CRYPTO_ALG_OPTIONAL_KEY)); inst->alg.halg.digestsize = alg->digestsize; inst->alg.halg.statesize = alg->statesize; inst->alg.halg.base.cra_ctxsize = sizeof(struct cryptd_hash_ctx); inst->alg.init_tfm = cryptd_hash_init_tfm; inst->alg.clone_tfm = cryptd_hash_clone_tfm; inst->alg.exit_tfm = cryptd_hash_exit_tfm; inst->alg.init = cryptd_hash_init_enqueue; inst->alg.update = cryptd_hash_update_enqueue; inst->alg.final = cryptd_hash_final_enqueue; inst->alg.finup = cryptd_hash_finup_enqueue; inst->alg.export = cryptd_hash_export; inst->alg.import = cryptd_hash_import; if (crypto_shash_alg_has_setkey(alg)) inst->alg.setkey = cryptd_hash_setkey; inst->alg.digest = cryptd_hash_digest_enqueue; inst->free = cryptd_hash_free; err = ahash_register_instance(tmpl, inst); if (err) { err_free_inst: cryptd_hash_free(inst); } return err; } static int cryptd_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct cryptd_aead_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; return crypto_aead_setkey(child, key, keylen); } static int cryptd_aead_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct cryptd_aead_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; return crypto_aead_setauthsize(child, authsize); } static void cryptd_aead_crypt(struct aead_request *req, struct crypto_aead *child, int err, int (*crypt)(struct aead_request *req), crypto_completion_t compl) { struct cryptd_aead_request_ctx *rctx; struct aead_request *subreq; struct cryptd_aead_ctx *ctx; struct crypto_aead *tfm; int refcnt; rctx = aead_request_ctx(req); subreq = &rctx->req; req->base.complete = subreq->base.complete; req->base.data = subreq->base.data; tfm = crypto_aead_reqtfm(req); if (unlikely(err == -EINPROGRESS)) goto out; aead_request_set_tfm(subreq, child); aead_request_set_callback(subreq, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); aead_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(subreq, req->assoclen); err = crypt(subreq); out: ctx = crypto_aead_ctx(tfm); refcnt = refcount_read(&ctx->refcnt); local_bh_disable(); aead_request_complete(req, err); local_bh_enable(); if (err == -EINPROGRESS) { subreq->base.complete = req->base.complete; subreq->base.data = req->base.data; req->base.complete = compl; req->base.data = req; } else if (refcnt && refcount_dec_and_test(&ctx->refcnt)) crypto_free_aead(tfm); } static void cryptd_aead_encrypt(void *data, int err) { struct aead_request *req = data; struct cryptd_aead_ctx *ctx; struct crypto_aead *child; ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); child = ctx->child; cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->encrypt, cryptd_aead_encrypt); } static void cryptd_aead_decrypt(void *data, int err) { struct aead_request *req = data; struct cryptd_aead_ctx *ctx; struct crypto_aead *child; ctx = crypto_aead_ctx(crypto_aead_reqtfm(req)); child = ctx->child; cryptd_aead_crypt(req, child, err, crypto_aead_alg(child)->decrypt, cryptd_aead_decrypt); } static int cryptd_aead_enqueue(struct aead_request *req, crypto_completion_t compl) { struct cryptd_aead_request_ctx *rctx = aead_request_ctx(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_queue *queue = cryptd_get_queue(crypto_aead_tfm(tfm)); struct aead_request *subreq = &rctx->req; subreq->base.complete = req->base.complete; subreq->base.data = req->base.data; req->base.complete = compl; req->base.data = req; return cryptd_enqueue_request(queue, &req->base); } static int cryptd_aead_encrypt_enqueue(struct aead_request *req) { return cryptd_aead_enqueue(req, cryptd_aead_encrypt ); } static int cryptd_aead_decrypt_enqueue(struct aead_request *req) { return cryptd_aead_enqueue(req, cryptd_aead_decrypt ); } static int cryptd_aead_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct aead_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_aead_spawn *spawn = &ictx->aead_spawn; struct cryptd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *cipher; cipher = crypto_spawn_aead(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_aead_set_reqsize( tfm, sizeof(struct cryptd_aead_request_ctx) + crypto_aead_reqsize(cipher)); return 0; } static void cryptd_aead_exit_tfm(struct crypto_aead *tfm) { struct cryptd_aead_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void cryptd_aead_free(struct aead_instance *inst) { struct aead_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->aead_spawn); kfree(inst); } static int cryptd_create_aead(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt, struct cryptd_queue *queue) { struct aead_instance_ctx *ctx; struct aead_instance *inst; struct aead_alg *alg; u32 type; u32 mask; int err; cryptd_type_and_mask(algt, &type, &mask); inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); ctx->queue = queue; err = crypto_grab_aead(&ctx->aead_spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), type, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->aead_spawn); err = cryptd_init_instance(aead_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC | (alg->base.cra_flags & CRYPTO_ALG_INTERNAL); inst->alg.base.cra_ctxsize = sizeof(struct cryptd_aead_ctx); inst->alg.ivsize = crypto_aead_alg_ivsize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.init = cryptd_aead_init_tfm; inst->alg.exit = cryptd_aead_exit_tfm; inst->alg.setkey = cryptd_aead_setkey; inst->alg.setauthsize = cryptd_aead_setauthsize; inst->alg.encrypt = cryptd_aead_encrypt_enqueue; inst->alg.decrypt = cryptd_aead_decrypt_enqueue; inst->free = cryptd_aead_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: cryptd_aead_free(inst); } return err; } static struct cryptd_queue queue; static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_LSKCIPHER: return cryptd_create_skcipher(tmpl, tb, algt, &queue); case CRYPTO_ALG_TYPE_HASH: return cryptd_create_hash(tmpl, tb, algt, &queue); case CRYPTO_ALG_TYPE_AEAD: return cryptd_create_aead(tmpl, tb, algt, &queue); } return -EINVAL; } static struct crypto_template cryptd_tmpl = { .name = "cryptd", .create = cryptd_create, .module = THIS_MODULE, }; struct cryptd_skcipher *cryptd_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { char cryptd_alg_name[CRYPTO_MAX_ALG_NAME]; struct cryptd_skcipher_ctx *ctx; struct crypto_skcipher *tfm; if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) return ERR_PTR(-EINVAL); tfm = crypto_alloc_skcipher(cryptd_alg_name, type, mask); if (IS_ERR(tfm)) return ERR_CAST(tfm); if (tfm->base.__crt_alg->cra_module != THIS_MODULE) { crypto_free_skcipher(tfm); return ERR_PTR(-EINVAL); } ctx = crypto_skcipher_ctx(tfm); refcount_set(&ctx->refcnt, 1); return container_of(tfm, struct cryptd_skcipher, base); } EXPORT_SYMBOL_GPL(cryptd_alloc_skcipher); struct crypto_skcipher *cryptd_skcipher_child(struct cryptd_skcipher *tfm) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(&tfm->base); return ctx->child; } EXPORT_SYMBOL_GPL(cryptd_skcipher_child); bool cryptd_skcipher_queued(struct cryptd_skcipher *tfm) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(&tfm->base); return refcount_read(&ctx->refcnt) - 1; } EXPORT_SYMBOL_GPL(cryptd_skcipher_queued); void cryptd_free_skcipher(struct cryptd_skcipher *tfm) { struct cryptd_skcipher_ctx *ctx = crypto_skcipher_ctx(&tfm->base); if (refcount_dec_and_test(&ctx->refcnt)) crypto_free_skcipher(&tfm->base); } EXPORT_SYMBOL_GPL(cryptd_free_skcipher); struct cryptd_ahash *cryptd_alloc_ahash(const char *alg_name, u32 type, u32 mask) { char cryptd_alg_name[CRYPTO_MAX_ALG_NAME]; struct cryptd_hash_ctx *ctx; struct crypto_ahash *tfm; if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) return ERR_PTR(-EINVAL); tfm = crypto_alloc_ahash(cryptd_alg_name, type, mask); if (IS_ERR(tfm)) return ERR_CAST(tfm); if (tfm->base.__crt_alg->cra_module != THIS_MODULE) { crypto_free_ahash(tfm); return ERR_PTR(-EINVAL); } ctx = crypto_ahash_ctx(tfm); refcount_set(&ctx->refcnt, 1); return __cryptd_ahash_cast(tfm); } EXPORT_SYMBOL_GPL(cryptd_alloc_ahash); struct crypto_shash *cryptd_ahash_child(struct cryptd_ahash *tfm) { struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(&tfm->base); return ctx->child; } EXPORT_SYMBOL_GPL(cryptd_ahash_child); struct shash_desc *cryptd_shash_desc(struct ahash_request *req) { struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); return &rctx->desc; } EXPORT_SYMBOL_GPL(cryptd_shash_desc); bool cryptd_ahash_queued(struct cryptd_ahash *tfm) { struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(&tfm->base); return refcount_read(&ctx->refcnt) - 1; } EXPORT_SYMBOL_GPL(cryptd_ahash_queued); void cryptd_free_ahash(struct cryptd_ahash *tfm) { struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(&tfm->base); if (refcount_dec_and_test(&ctx->refcnt)) crypto_free_ahash(&tfm->base); } EXPORT_SYMBOL_GPL(cryptd_free_ahash); struct cryptd_aead *cryptd_alloc_aead(const char *alg_name, u32 type, u32 mask) { char cryptd_alg_name[CRYPTO_MAX_ALG_NAME]; struct cryptd_aead_ctx *ctx; struct crypto_aead *tfm; if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) return ERR_PTR(-EINVAL); tfm = crypto_alloc_aead(cryptd_alg_name, type, mask); if (IS_ERR(tfm)) return ERR_CAST(tfm); if (tfm->base.__crt_alg->cra_module != THIS_MODULE) { crypto_free_aead(tfm); return ERR_PTR(-EINVAL); } ctx = crypto_aead_ctx(tfm); refcount_set(&ctx->refcnt, 1); return __cryptd_aead_cast(tfm); } EXPORT_SYMBOL_GPL(cryptd_alloc_aead); struct crypto_aead *cryptd_aead_child(struct cryptd_aead *tfm) { struct cryptd_aead_ctx *ctx; ctx = crypto_aead_ctx(&tfm->base); return ctx->child; } EXPORT_SYMBOL_GPL(cryptd_aead_child); bool cryptd_aead_queued(struct cryptd_aead *tfm) { struct cryptd_aead_ctx *ctx = crypto_aead_ctx(&tfm->base); return refcount_read(&ctx->refcnt) - 1; } EXPORT_SYMBOL_GPL(cryptd_aead_queued); void cryptd_free_aead(struct cryptd_aead *tfm) { struct cryptd_aead_ctx *ctx = crypto_aead_ctx(&tfm->base); if (refcount_dec_and_test(&ctx->refcnt)) crypto_free_aead(&tfm->base); } EXPORT_SYMBOL_GPL(cryptd_free_aead); static int __init cryptd_init(void) { int err; cryptd_wq = alloc_workqueue("cryptd", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1); if (!cryptd_wq) return -ENOMEM; err = cryptd_init_queue(&queue, cryptd_max_cpu_qlen); if (err) goto err_destroy_wq; err = crypto_register_template(&cryptd_tmpl); if (err) goto err_fini_queue; return 0; err_fini_queue: cryptd_fini_queue(&queue); err_destroy_wq: destroy_workqueue(cryptd_wq); return err; } static void __exit cryptd_exit(void) { destroy_workqueue(cryptd_wq); cryptd_fini_queue(&queue); crypto_unregister_template(&cryptd_tmpl); } subsys_initcall(cryptd_init); module_exit(cryptd_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Software async crypto daemon"); MODULE_ALIAS_CRYPTO("cryptd");
10 10 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __MAC802154_DRIVER_OPS #define __MAC802154_DRIVER_OPS #include <linux/types.h> #include <linux/rtnetlink.h> #include <net/mac802154.h> #include "ieee802154_i.h" #include "trace.h" static inline int drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb) { return local->ops->xmit_async(&local->hw, skb); } static inline int drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) { might_sleep(); return local->ops->xmit_sync(&local->hw, skb); } static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.pan_id = pan_id; trace_802154_drv_set_pan_id(local, pan_id); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANID_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.ieee_addr = extended_addr; trace_802154_drv_set_extended_addr(local, extended_addr); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_IEEEADDR_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.short_addr = short_addr; trace_802154_drv_set_short_addr(local, short_addr); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_SADDR_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } filt.pan_coord = is_coord; trace_802154_drv_set_pan_coord(local, is_coord); ret = local->ops->set_hw_addr_filt(&local->hw, &filt, IEEE802154_AFILT_PANC_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { int ret; might_sleep(); if (!local->ops->set_promiscuous_mode) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_promiscuous_mode(local, on); ret = local->ops->set_promiscuous_mode(&local->hw, on); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_start(struct ieee802154_local *local, enum ieee802154_filtering_level level, const struct ieee802154_hw_addr_filt *addr_filt) { int ret; might_sleep(); /* setup receive mode parameters e.g. address mode */ if (local->hw.flags & IEEE802154_HW_AFILT) { ret = drv_set_pan_id(local, addr_filt->pan_id); if (ret < 0) return ret; ret = drv_set_short_addr(local, addr_filt->short_addr); if (ret < 0) return ret; ret = drv_set_extended_addr(local, addr_filt->ieee_addr); if (ret < 0) return ret; } switch (level) { case IEEE802154_FILTERING_NONE: fallthrough; case IEEE802154_FILTERING_1_FCS: fallthrough; case IEEE802154_FILTERING_2_PROMISCUOUS: /* TODO: Requires a different receive mode setup e.g. * at86rf233 hardware. */ fallthrough; case IEEE802154_FILTERING_3_SCAN: if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { ret = drv_set_promiscuous_mode(local, true); if (ret < 0) return ret; } else { return -EOPNOTSUPP; } /* In practice other filtering levels can be requested, but as * for now most hardware/drivers only support * IEEE802154_FILTERING_NONE, we fallback to this actual * filtering level in hardware and make our own additional * filtering in mac802154 receive path. * * TODO: Move this logic to the device drivers as hardware may * support more higher level filters. Hardware may also require * a different order how register are set, which could currently * be buggy, so all received parameters need to be moved to the * start() callback and let the driver go into the mode before * it will turn on receive handling. */ local->phy->filtering = IEEE802154_FILTERING_NONE; break; case IEEE802154_FILTERING_4_FRAME_FIELDS: /* Do not error out if IEEE802154_HW_PROMISCUOUS because we * expect the hardware to operate at the level * IEEE802154_FILTERING_4_FRAME_FIELDS anyway. */ if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { ret = drv_set_promiscuous_mode(local, false); if (ret < 0) return ret; } local->phy->filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; break; default: WARN_ON(1); return -EINVAL; } trace_802154_drv_start(local); local->started = true; smp_mb(); ret = local->ops->start(&local->hw); trace_802154_drv_return_int(local, ret); return ret; } static inline void drv_stop(struct ieee802154_local *local) { might_sleep(); trace_802154_drv_stop(local); local->ops->stop(&local->hw); trace_802154_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); tasklet_enable(&local->tasklet); barrier(); local->started = false; } static inline int drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { int ret; might_sleep(); trace_802154_drv_set_channel(local, page, channel); ret = local->ops->set_channel(&local->hw, page, channel); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { int ret; might_sleep(); if (!local->ops->set_txpower) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_tx_power(local, mbm); ret = local->ops->set_txpower(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_cca_mode(struct ieee802154_local *local, const struct wpan_phy_cca *cca) { int ret; might_sleep(); if (!local->ops->set_cca_mode) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_cca_mode(local, cca); ret = local->ops->set_cca_mode(&local->hw, cca); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { int ret; might_sleep(); if (!local->ops->set_lbt) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_lbt_mode(local, mode); ret = local->ops->set_lbt(&local->hw, mode); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { int ret; might_sleep(); if (!local->ops->set_cca_ed_level) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_cca_ed_level(local, mbm); ret = local->ops->set_cca_ed_level(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } static inline int drv_set_csma_params(struct ieee802154_local *local, u8 min_be, u8 max_be, u8 max_csma_backoffs) { int ret; might_sleep(); if (!local->ops->set_csma_params) { WARN_ON(1); return -EOPNOTSUPP; } trace_802154_drv_set_csma_params(local, min_be, max_be, max_csma_backoffs); ret = local->ops->set_