Total coverage: 234595 (15%)of 1575468
5 716 93 631 37 81 642 177 269 124 286 79 93 35 2 263 7 7 92 12 9 1 13 4 12 12 211 209 1 5 3 3 3 40 60 107 106 404 404 404 10 19 19 19 19 19 18 18 18 1 1 14 7 26 26 10 10 28 28 15 2 10 15 93 49 46 17 1 15 1 15 15 16 4 4 4 10 6 4 6 6 5 5 4 5 2 2 1 2 6 1 6 8 8 7 7 4 4 4 1 1 5 14 15 15 1 13 13 2 12 2 2 13 5 5 5 5 5 5 5 10 10 10 3 10 2 10 78 78 3 2 1 3 11 11 1 11 11 1 10 11 53 3 1 53 5 51 53 41 13 52 14 41 11 30 4 37 40 1 41 1 53 42 13 421 66 378 67 68 418 417 419 22 74 405 679 406 459 733 731 734 725 36 26 26 22 4 2 1 1 1 1 4 2 3 4 22 22 22 11 11 2 17 18 26 26 26 26 4 26 26 26 26 26 25 26 26 26 26 1 1 4 22 26 22 4 25 1 26 25 26 26 26 26 29 26 4 7 30 30 31 31 31 30 27 4 31 1 4 4 6 21 4 5 12 13 22 4 4 4 4 22 4 26 26 4 30 27 1 1 2 29 31 25 25 25 25 14 11 25 77 78 78 77 54 30 31 31 30 55 55 5 17 17 17 13 4 11 15 1 1 15 15 10 10 1 9 10 2 8 2 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 
815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 
1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 
1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 
2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 
2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 
2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 
3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 
3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 
4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2009 Red Hat, Inc. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> #include <linux/swapops.h> #include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/mm_types.h> #include <linux/khugepaged.h> #include <linux/freezer.h> #include <linux/pfn_t.h> #include <linux/mman.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/migrate.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/page_owner.h> #include <linux/sched/sysctl.h> #include <linux/memory-tiers.h> #include <linux/compat.h> #include <linux/pgalloc_tag.h> #include <linux/pagewalk.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" #include "swap.h" #define CREATE_TRACE_POINTS #include <trace/events/thp.h> /* * By default, transparent hugepage support is disabled in order to avoid * risking an increased memory footprint for applications that are not * guaranteed to benefit from it. When transparent hugepage support is * enabled, it is for all mappings, and khugepaged scans all mappings. * Defrag is invoked by khugepaged hugepage allocations and by page faults * for all hugepage allocations. 
*/ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS (1<<TRANSPARENT_HUGEPAGE_FLAG)| #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); static struct shrinker *deferred_split_shrinker; static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc); static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc); static bool split_underused_thp = true; static atomic_t huge_zero_refcount; struct folio *huge_zero_folio __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; unsigned long huge_anon_orders_always __read_mostly; unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; static bool anon_orders_configured __initdata; static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) return false; if (!vma->vm_file) return false; inode = file_inode(vma->vm_file); return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, unsigned long orders) { bool smaps = tva_flags & TVA_SMAPS; bool in_pf = tva_flags & TVA_IN_PF; bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; unsigned long supported_orders; /* Check the intersection of requested and supported orders. 
*/ if (vma_is_anonymous(vma)) supported_orders = THP_ORDERS_ALL_ANON; else if (vma_is_special_huge(vma)) supported_orders = THP_ORDERS_ALL_SPECIAL; else supported_orders = THP_ORDERS_ALL_FILE_DEFAULT; orders &= supported_orders; if (!orders) return 0; if (!vma->vm_mm) /* vdso */ return 0; if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ if (vma_is_dax(vma)) return in_pf ? orders : 0; /* * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set. */ if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return 0; /* * Check alignment for file vma and size for both file and anon vma by * filtering out the unsuitable orders. * * Skip the check for page fault. Huge fault does the check in fault * handlers. */ if (!in_pf) { int order = highest_order(orders); unsigned long addr; while (orders) { addr = vma->vm_end - (PAGE_SIZE << order); if (thp_vma_suitable_order(vma, addr, order)) break; order = next_order(&orders, order); } if (!orders) return 0; } /* * Enabled via shmem mount options or sysfs settings. * Must be done before hugepage flags check since shmem has its * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) return shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, !enforce_sysfs); if (!vma_is_anonymous(vma)) { /* * Enforce sysfs THP requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). */ if (enforce_sysfs && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) return 0; /* * Trust that ->huge_fault() handlers know what they are doing * in fault path. 
*/ if (((in_pf || smaps)) && vma->vm_ops->huge_fault) return orders; /* Only regular file is valid in collapse path */ if (((!in_pf || smaps)) && file_thp_enabled(vma)) return orders; return 0; } if (vma_is_temporary_stack(vma)) return 0; /* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. * * Allow page fault since anon_vma may be not initialized until * the first page fault. */ if (!vma->anon_vma) return (smaps || in_pf) ? orders : 0; return orders; } static bool get_huge_zero_page(void) { struct folio *zero_folio; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) return true; zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_folio) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } /* Ensure zero folio won't have large_rmappable flag set. */ folio_clear_large_rmappable(zero_folio); preempt_disable(); if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) { preempt_enable(); folio_put(zero_folio); goto retry; } WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio)); /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put * last reference. 
*/ BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_page()) return NULL; if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); return READ_ONCE(huge_zero_folio); } void mm_put_huge_zero_folio(struct mm_struct *mm) { if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) put_huge_zero_page(); } static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { /* we can free zero page only if last reference remains */ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; } static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, struct shrink_control *sc) { if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct folio *zero_folio = xchg(&huge_zero_folio, NULL); BUG_ON(zero_folio == NULL); WRITE_ONCE(huge_zero_pfn, ~0UL); folio_put(zero_folio); return HPAGE_PMD_NR; } return 0; } static struct shrinker *huge_zero_page_shrinker; #ifdef CONFIG_SYSFS static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) output = "[always] madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) output = "always [madvise] never"; else output = "always madvise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret = count; if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, 
&transparent_hugepage_flags); } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); } else ret = -EINVAL; if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag) { return sysfs_emit(buf, "%d\n", !!test_bit(flag, &transparent_hugepage_flags)); } ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag) { unsigned long value; int ret; ret = kstrtoul(buf, 10, &value); if (ret < 0) return ret; if (value > 1) return -EINVAL; if (value) set_bit(flag, &transparent_hugepage_flags); else clear_bit(flag, &transparent_hugepage_flags); return count; } static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) output = "[always] defer defer+madvise madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) output = "always [defer] defer+madvise madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) output = "always defer [defer+madvise] madvise never"; else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) output = "always defer defer+madvise [madvise] never"; else output = "always defer defer+madvise madvise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 
&transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "defer+madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "defer")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); } else return -EINVAL; return count; } static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); static ssize_t use_zero_page_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return 
single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}

/* Set or clear TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG from a "0"/"1" write. */
static ssize_t use_zero_page_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}

static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

/* Read-only attribute: print HPAGE_PMD_SIZE as an unsigned long. */
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}

static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

/* Print the current value of split_underused_thp. */
static ssize_t split_underused_thp_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", split_underused_thp);
}

/*
 * Parse @buf with kstrtobool() directly into split_underused_thp.
 * Returns @count on success or the negative kstrtobool() error.
 */
static ssize_t split_underused_thp_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	int err = kstrtobool(buf, &split_underused_thp);

	if (err < 0)
		return err;

	return count;
}

/* The attribute is named "shrink_underused", not after the handlers. */
static struct kobj_attribute split_underused_thp_attr = __ATTR(
	shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);

/* Attributes of the top-level group; shmem_enabled only with CONFIG_SHMEM. */
static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	&split_underused_thp_attr.attr,
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
/* Taken by anon_enabled_store() around its huge_anon_orders_* updates. */
static DEFINE_SPINLOCK(huge_anon_orders_lock);
/* Every thpsize object created by hugepage_init_sysfs() is linked here. */
static LIST_HEAD(thpsize_list);

/*
 * Show the per-size anon policy with the active entry in brackets.  The
 * order's bit is looked up in the always, inherit and madvise masks in that
 * sequence; "never" is reported when it is set in none of them.
 */
static ssize_t anon_enabled_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if
(test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

/*
 * Set the per-size anon policy named in @buf.  Under huge_anon_orders_lock
 * the order's bit is cleared in the two other masks and set in the selected
 * one ("never" clears it in all three).  After a successful update
 * start_stop_khugepaged() is called and its error, if any, is returned
 * instead of @count.  Unknown keywords yield -EINVAL.
 */
static ssize_t anon_enabled_store(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err;

		err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute anon_enabled_attr =
	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);

/* Control attributes added for orders in THP_ORDERS_ALL_ANON. */
static struct attribute *anon_ctrl_attrs[] = {
	&anon_enabled_attr.attr,
	NULL,
};

static const struct attribute_group anon_ctrl_attr_grp = {
	.attrs = anon_ctrl_attrs,
};

/* Control attributes added for orders in THP_ORDERS_ALL_FILE_DEFAULT. */
static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group file_ctrl_attr_grp = {
	.attrs = file_ctrl_attrs,
};

/* Control attributes added for every order; currently empty. */
static struct attribute *any_ctrl_attrs[] = {
	NULL,
};

static const
struct attribute_group any_ctrl_attr_grp = {
	.attrs = any_ctrl_attrs,
};

/* kobj_type of the per-size objects; release is thpsize_release(). */
static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};

DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

/*
 * Return the sum over all possible CPUs of the per-CPU counter
 * mthp_stats.stats[order][item].
 */
static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}

/*
 * Define <_name>_show(), which prints sum_mthp_stat() for the kobject's
 * order and the counter _index, plus a read-only <_name>_attr for it.
 */
#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);

/*
 * "stats" attributes added for anon-capable orders.  The swap counters are
 * listed here only without CONFIG_SHMEM; with it they are in any_stats_attrs.
 */
static struct attribute *anon_stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
	&zswpout_attr.attr,
	&swpin_attr.attr,
	&swpin_fallback_attr.attr,
	&swpin_fallback_charge_attr.attr,
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_deferred_attr.attr,
	&nr_anon_attr.attr,
	&nr_anon_partially_mapped_attr.attr,
	NULL,
};

static struct attribute_group anon_stats_attr_grp = {
	.name = "stats",
	.attrs = anon_stats_attrs,
};

/* "stats" attributes added for file-capable orders (CONFIG_SHMEM only). */
static struct attribute *file_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&shmem_alloc_attr.attr,
	&shmem_fallback_attr.attr,
	&shmem_fallback_charge_attr.attr,
#endif
	NULL,
};

static struct attribute_group file_stats_attr_grp = {
	.name = "stats",
	.attrs = file_stats_attrs,
};

/*
 * "stats" attributes added for every order.  The swap counters live here
 * when CONFIG_SHMEM is set and in anon_stats_attrs otherwise.
 */
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&zswpout_attr.attr,
	&swpin_attr.attr,
	&swpin_fallback_attr.attr,
	&swpin_fallback_charge_attr.attr,
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_attr.attr,
	&split_failed_attr.attr,
	NULL,
};

static struct attribute_group any_stats_attr_grp = {
	.name = "stats",
	.attrs = any_stats_attrs,
};

/*
 * Add @grp to @kobj.  The three "stats" groups above share one name, so a
 * named group is merged first and only created if the merge fails.
 * Returns 0 on success or the error from sysfs_create_group().
 */
static int sysfs_add_group(struct kobject *kobj,
			   const struct attribute_group *grp)
{
	/* Non-zero start value sends unnamed groups straight to create. */
	int ret = -ENOENT;

	/*
	 * If the group is named, try to merge first, assuming the subdirectory
	 * was already created. This avoids the warning emitted by
	 * sysfs_create_group() if the directory already exists.
*/ if (grp->name) ret = sysfs_merge_group(kobj, grp); if (ret) ret = sysfs_create_group(kobj, grp); return ret; } static struct thpsize *thpsize_create(int order, struct kobject *parent) { unsigned long size = (PAGE_SIZE << order) / SZ_1K; struct thpsize *thpsize; int ret = -ENOMEM; thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL); if (!thpsize) goto err; thpsize->order = order; ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent, "hugepages-%lukB", size); if (ret) { kfree(thpsize); goto err; } ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp); if (ret) goto err_put; ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp); if (ret) goto err_put; if (BIT(order) & THP_ORDERS_ALL_ANON) { ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp); if (ret) goto err_put; ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp); if (ret) goto err_put; } if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) { ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp); if (ret) goto err_put; ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp); if (ret) goto err_put; } return thpsize; err_put: kobject_put(&thpsize->kobj); err: return ERR_PTR(ret); } static void thpsize_release(struct kobject *kobj) { kfree(to_thpsize(kobj)); } static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) { int err; struct thpsize *thpsize; unsigned long orders; int order; /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time * constant so we have to do this here. 
*/
	if (!anon_orders_configured)
		huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	/* Walk the supported orders starting from the highest one. */
	orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	/* Both groups and some thpsize objects exist: full teardown. */
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

/*
 * Undo hugepage_init_sysfs(): drop every thpsize on thpsize_list, remove
 * both attribute groups and put @hugepage_kobj.
 */
static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
/* Without CONFIG_SYSFS both helpers are no-ops. */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

/*
 * Allocate and register the "thp-zero" and "thp-deferred_split" shrinkers.
 * Both are allocated before either is registered, so a failed second
 * allocation only has to free the first.  Returns 0 or -ENOMEM.
 */
static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

/* Free both shrinkers set up by thp_shrinker_init(). */
static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

/*
 * Subsystem initcall: set up sysfs, khugepaged and the shrinkers, then call
 * start_stop_khugepaged().  Each failure unwinds the earlier steps through
 * the err_* labels.  Without hardware support the flags word is set to the
 * UNSUPPORTED bit only and -EINVAL is returned.
 */
static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save.  The admin can still enable it through /sys.
*/
	/* 512 << (20 - PAGE_SHIFT) is 512MB expressed in pages. */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

/*
 * Handle the "transparent_hugepage=" boot parameter.  Accepts "always",
 * "madvise" or "never" and updates TRANSPARENT_HUGEPAGE_FLAG and
 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG accordingly.
 *
 * Returns 1 when the value was recognised; otherwise warns and returns 0.
 */
static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

/* Scratch copy of the thp_anon= string; strsep() modifies what it parses. */
static char str_dup[PAGE_SIZE] __initdata;
/*
 * Handle the "thp_anon=" boot parameter.  The string is a ';'-separated
 * list of "<range>:<policy>" items, where <range> is a ','-separated list
 * of sizes or "<start>-<end>" size ranges and <policy> is one of always,
 * madvise, inherit or never.
 *
 * Changes are collected in local copies of the three order masks and
 * written back only if the whole string parses, so a malformed string
 * leaves the current settings untouched.  Returns 1 on success, 0 on error.
 */
static int __init setup_thp_anon(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strscpy(str_dup, str);

	always = huge_anon_orders_always;
	madvise = huge_anon_orders_madvise;
	inherit = huge_anon_orders_inherit;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		range = strsep(&token, ":");
		policy = token;

		/* No ':' in this item means there is no policy part. */
		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
				end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
			} else {
				start_size = end_size = subtoken;
				start = end =
get_order_from_str(subtoken,
								 THP_ORDERS_ALL_ANON);
			}

			if (start == -EINVAL) {
				pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
				goto err;
			}

			if (end == -EINVAL) {
				pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
				goto err;
			}

			if (start < 0 || end < 0 || start > end)
				goto err;

			/* Inclusive range of orders [start, end]. */
			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
			} else if (!strcmp(policy, "madvise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else {
				pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
				goto err;
			}
		}
	}

	/* Whole string parsed: publish the new masks. */
	huge_anon_orders_always = always;
	huge_anon_orders_madvise = madvise;
	huge_anon_orders_inherit = inherit;
	anon_orders_configured = true;
	return 1;

err:
	pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_anon=", setup_thp_anon);

/* Apply pmd_mkwrite() to @pmd only when @vma has VM_WRITE set. */
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

#ifdef CONFIG_MEMCG
/*
 * Return the deferred split queue for @folio: the one of its memcg if it
 * has one, otherwise the one of its node.
 */
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
/* Without CONFIG_MEMCG the queue is always the one of the folio's node. */
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

/*
 * True for a large folio that is either the huge zero folio or has the
 * large-rmappable flag set.
 */
static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if
(!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
		folio_test_large_rmappable(folio);
}

/*
 * Try to find an unmapped area for a mapping of @len bytes at file offset
 * @off whose address lines up with @off modulo @size.  The search is done
 * for @len + @size bytes and the result is then shifted inside that area.
 *
 * Returns the chosen address, or 0 when no aligned placement is attempted
 * or found; the caller then falls back to an unpadded search.
 */
static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
		return 0;

	/* The range must cover at least one full @size-aligned unit. */
	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	/* Reject padded lengths that wrap around. */
	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	/* Distance from @ret to the next address congruent to @off. */
	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

/*
 * Like mm_get_unmapped_area_vmflags(), but first try a PMD_SIZE-aligned
 * placement via __thp_get_unmapped_area().
 */
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

/* thp_get_unmapped_area_vmflags() with vm_flags of 0. */
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

/*
 * Allocate, memcg-charge, zero if needed, and mark uptodate a PMD-order
 * anonymous folio for @vma covering @addr.  Returns NULL when allocation or
 * charging fails, after bumping the matching fallback counters.
 */
static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
		unsigned long addr)
{
	gfp_t gfp = vma_thp_gfp_mask(vma);
	const int order = HPAGE_PMD_ORDER;
	struct folio *folio;

	folio = vma_alloc_folio(gfp,
order, vma, addr & HPAGE_PMD_MASK);

	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		/* Charge failure counts as a fallback and as a charge fallback. */
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);

       /*
	* When a folio is not zeroed during allocation (__GFP_ZERO not used)
	* or user folios require special handling, folio_zero_user() is used to
	* make sure that the page corresponding to the faulting address will be
	* hot in the cache after zeroing.
	*/
	if (user_alloc_needs_zeroing())
		folio_zero_user(folio, addr);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	return folio;
}

/*
 * Install @folio as a dirty (and, if the VMA allows it, writable) huge PMD
 * at @haddr: add the anon rmap and LRU linkage, set the entry, and account
 * the pages and THP_FAULT_ALLOC events.
 */
static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
		struct vm_area_struct *vma, unsigned long haddr)
{
	pmd_t entry;

	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
	update_mmu_cache_pmd(vma, haddr, pmd);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	count_vm_event(THP_FAULT_ALLOC);
	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}

/*
 * Handle an anonymous fault with a freshly allocated PMD-sized folio.
 * Returns VM_FAULT_FALLBACK when no folio could be allocated, VM_FAULT_OOM
 * when the page table to deposit could not be allocated, the userfaultfd
 * result when the fault is handed to userland, and 0 when the folio was
 * mapped or the PMD turned out to be populated already.
 */
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	pgtable_t pgtable;
	vm_fault_t ret = 0;

	folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
	if (unlikely(!folio))
		return VM_FAULT_FALLBACK;

	pgtable =
pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		/* PMD got populated meanwhile: drop ours, ret stays 0. */
		goto unlock_release;
	} else {
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
		mm_inc_nr_ptes(vma->vm_mm);
		deferred_split_folio(folio, false);
		spin_unlock(vmf->ptl);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	/* pgtable is NULL when we arrive here from the allocation failure. */
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	/* A NULL @vma is treated as not madvised. */
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ?
__GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
		       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

	/* No defrag flag set ("never"). */
	return GFP_TRANSHUGE_LIGHT;
}

/*
 * Map @zero_folio as a huge PMD at @haddr and deposit @pgtable for it.
 * Does nothing if *@pmd is already populated; @pgtable is then not consumed.
 */
/* Caller must hold page table lock. */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
		struct folio *zero_folio)
{
	pmd_t entry;
	if (!pmd_none(*pmd))
		return;
	entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
	entry = pmd_mkhuge(entry);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, entry);
	mm_inc_nr_ptes(mm);
}

/*
 * Anonymous huge PMD fault entry point.  Read faults are served with the
 * huge zero folio when the mm does not forbid it and its use is enabled;
 * everything else goes to __do_huge_pmd_anonymous_page().  Returns
 * VM_FAULT_FALLBACK when the VMA cannot hold a PMD-order THP at this
 * address or the zero folio is unavailable.
 */
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret;

	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
		return VM_FAULT_FALLBACK;
	ret = vmf_anon_prepare(vmf);
	if (ret)
		return ret;
	khugepaged_enter_vma(vma, vma->vm_flags);

	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
			!mm_forbids_zeropage(vma->vm_mm) &&
			transparent_hugepage_use_zero_page()) {
		pgtable_t pgtable;
		struct folio *zero_folio;
		/* NOTE(review): this declaration shadows the outer ret. */
		vm_fault_t ret;

		pgtable = pte_alloc_one(vma->vm_mm);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;
		zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
		if (unlikely(!zero_folio)) {
			pte_free(vma->vm_mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}
		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
		ret = 0;
		if (pmd_none(*vmf->pmd)) {
			ret = check_stable_address_space(vma->vm_mm);
			if (ret) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
			} else if (userfaultfd_missing(vma)) {
				spin_unlock(vmf->ptl);
				pte_free(vma->vm_mm, pgtable);
				ret = handle_userfault(vmf, VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_folio(pgtable, vma->vm_mm, vma,
						   haddr, vmf->pmd, zero_folio);
				update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
			}
		} else {
			/* PMD already populated: nothing to map, ret stays 0. */
			spin_unlock(vmf->ptl);
			pte_free(vma->vm_mm, pgtable);
		}
		return ret;
	}

	return __do_huge_pmd_anonymous_page(vmf);
}

/*
 * Install a huge PMD for @pfn at @addr under the PMD lock.
 *
 * If the PMD is already populated, a write fault on the same pfn only
 * upgrades the entry to young/dirty (and writable if the VMA allows it);
 * otherwise the existing entry is left alone.  @pgtable, when non-NULL, is
 * deposited with a newly installed entry or freed if it was not used.
 */
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t entry;
	spinlock_t *ptl;

	ptl = pmd_lock(mm, pmd);
	if (!pmd_none(*pmd)) {
		if (write) {
			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
				/* A different pfn is only expected for the huge zero pmd. */
				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
				goto out_unlock;
			}
			entry = pmd_mkyoung(*pmd);
			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
			if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
				update_mmu_cache_pmd(vma, addr, pmd);
		}

		goto out_unlock;
	}

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);
	else
		entry = pmd_mkspecial(entry);
	if (write) {
		entry = pmd_mkyoung(pmd_mkdirty(entry));
		entry = maybe_pmd_mkwrite(entry, vma);
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		/* Ownership moved to the page tables; do not free below. */
		pgtable = NULL;
	}

	set_pmd_at(mm, addr, pmd, entry);
	update_mmu_cache_pmd(vma, addr, pmd);

out_unlock:
	spin_unlock(ptl);
	if (pgtable)
		pte_free(mm, pgtable);
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;
	pgtable_t pgtable = NULL;

	/*
	 * If we had pmd_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
*/
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	/* The PMD-aligned address must still lie inside the VMA. */
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	track_pfn_insert(vma, &pgprot, pfn);

	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/* Apply pud_mkwrite() to @pud only when @vma has VM_WRITE set. */
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pud = pud_mkwrite(pud);
	return pud;
}

/*
 * Install a huge PUD for @pfn at @addr under the PUD lock.  If the PUD is
 * already populated, a write fault on the same pfn only upgrades the entry
 * to young/dirty (and writable if the VMA allows it); a differing pfn
 * triggers a one-time warning and leaves the entry untouched.
 */
static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
		pud_t *pud, pfn_t pfn, bool write)
{
	struct mm_struct *mm = vma->vm_mm;
	pgprot_t prot = vma->vm_page_prot;
	pud_t entry;
	spinlock_t *ptl;

	ptl = pud_lock(mm, pud);
	if (!pud_none(*pud)) {
		if (write) {
			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
				goto out_unlock;
			entry = pud_mkyoung(*pud);
			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
				update_mmu_cache_pud(vma, addr, pud);
		}
		goto out_unlock;
	}

	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pud_mkdevmap(entry);
	else
		entry = pud_mkspecial(entry);
	if (write) {
		entry = pud_mkyoung(pud_mkdirty(entry));
		entry = maybe_pud_mkwrite(entry, vma);
	}
	set_pud_at(mm, addr, pud, entry);
	update_mmu_cache_pud(vma, addr, pud);

out_unlock:
	spin_unlock(ptl);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
*/
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
	unsigned long addr = vmf->address & PUD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	pgprot_t pgprot = vma->vm_page_prot;

	/*
	 * If we had pud_special, we could avoid all these restrictions,
	 * but we need to be consistent with PTEs and architectures that
	 * can't support a 'special' bit.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
			!pfn_t_devmap(pfn));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

	/* The PUD-aligned address must still lie inside the VMA. */
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	/* NOTE(review): pgprot updated above is not passed on; insert_pfn_pud() rereads vma->vm_page_prot. */
	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Mark the huge PMD young, and dirty too when @write is set, updating the
 * MMU cache if the access flags actually changed.
 */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
	       pmd_t *pmd, bool write)
{
	pmd_t _pmd;

	_pmd = pmd_mkyoung(*pmd);
	if (write)
		_pmd = pmd_mkdirty(_pmd);
	if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
				  pmd, _pmd, write))
		update_mmu_cache_pmd(vma, addr, pmd);
}

/*
 * Look up the page backing @addr in a devmap huge PMD.  The PMD lock must
 * be held (asserted below).
 *
 * Returns NULL when the PMD is not a present devmap entry or lacks write
 * permission for a FOLL_WRITE request, ERR_PTR(-EEXIST) when the caller
 * passed neither FOLL_GET nor FOLL_PIN, ERR_PTR(-EFAULT) when no
 * dev_pagemap covers the pfn, or the page (or the try_grab_folio() error).
 */
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
	unsigned long pfn = pmd_pfn(*pmd);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	int ret;

	assert_spin_locked(pmd_lockptr(mm, pmd));

	if (flags & FOLL_WRITE && !pmd_write(*pmd))
		return NULL;

	if (pmd_present(*pmd) && pmd_devmap(*pmd))
		/* pass */;
	else
		return NULL;

	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

	/*
	 * device mapped pages can only be returned if the
	 * caller will manage the page reference count.
*/
	if (!(flags & (FOLL_GET | FOLL_PIN)))
		return ERR_PTR(-EEXIST);

	/* Advance to the base page that backs @addr within the huge mapping. */
	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	*pgmap = get_dev_pagemap(pfn, *pgmap);
	if (!*pgmap)
		return ERR_PTR(-EFAULT);
	page = pfn_to_page(pfn);
	ret = try_grab_folio(page_folio(page), 1, flags);
	if (ret)
		page = ERR_PTR(ret);

	return page;
}

/*
 * Copy the huge PMD at @src_pmd into @dst_pmd.
 *
 * Returns 0 on success (including the case where a non-anonymous
 * destination VMA is skipped), -ENOMEM when the page table to deposit
 * cannot be allocated, and -EAGAIN when the source is no longer a huge PMD
 * or the anon rmap could not be duplicated, in which case the source PMD is
 * split first.
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	struct page *src_page;
	struct folio *src_folio;
	pmd_t pmd;
	pgtable_t pgtable = NULL;
	int ret = -ENOMEM;

	pmd = pmdp_get_lockless(src_pmd);
	if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
		dst_ptl = pmd_lock(dst_mm, dst_pmd);
		src_ptl = pmd_lockptr(src_mm, src_pmd);
		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
		/*
		 * No need to recheck the pmd, it can't change with write
		 * mmap lock held here.
		 *
		 * Meanwhile, making sure it's not a CoW VMA with writable
		 * mapping, otherwise it means either the anon page wrongly
		 * applied special bit, or we made the PRIVATE mapping be
		 * able to wrongly write to the backend MMIO.
*/
		VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
		goto set_pmd;
	}

	/* Skip if can be re-fill on fault */
	if (!vma_is_anonymous(dst_vma))
		return 0;

	pgtable = pte_alloc_one(dst_mm);
	if (unlikely(!pgtable))
		goto out;

	dst_ptl = pmd_lock(dst_mm, dst_pmd);
	src_ptl = pmd_lockptr(src_mm, src_pmd);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	/* Re-read the source entry now that both locks are held. */
	pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	if (unlikely(is_swap_pmd(pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(pmd);

		VM_BUG_ON(!is_pmd_migration_entry(pmd));
		if (!is_readable_migration_entry(entry)) {
			/* Downgrade the source entry to a readable migration entry. */
			entry = make_readable_migration_entry(
							swp_offset(entry));
			pmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*src_pmd))
				pmd = pmd_swp_mksoft_dirty(pmd);
			if (pmd_swp_uffd_wp(*src_pmd))
				pmd = pmd_swp_mkuffd_wp(pmd);
			set_pmd_at(src_mm, addr, src_pmd, pmd);
		}
		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(dst_mm);
		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
		if (!userfaultfd_wp(dst_vma))
			pmd = pmd_swp_clear_uffd_wp(pmd);
		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
		ret = 0;
		goto out_unlock;
	}
#endif

	if (unlikely(!pmd_trans_huge(pmd))) {
		pte_free(dst_mm, pgtable);
		goto out_unlock;
	}
	/*
	 * When page table lock is held, the huge zero pmd should not be
	 * under splitting since we don't split the page itself, only pmd to
	 * a page table.
	 */
	if (is_huge_zero_pmd(pmd)) {
		/*
		 * mm_get_huge_zero_folio() will never allocate a new
		 * folio here, since we already have a zero page to
		 * copy. It just takes a reference.
		 */
		mm_get_huge_zero_folio(dst_mm);
		goto out_zero_page;
	}

	src_page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
	src_folio = page_folio(src_page);

	folio_get(src_folio);
	if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
		/* Page maybe pinned: split and retry the fault on PTEs.
*/
		folio_put(src_folio);
		pte_free(dst_mm, pgtable);
		spin_unlock(src_ptl);
		spin_unlock(dst_ptl);
		__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
		return -EAGAIN;
	}
	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
	mm_inc_nr_ptes(dst_mm);
	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
	/* Write-protect both the source entry and the copy. */
	pmdp_set_wrprotect(src_mm, addr, src_pmd);
	if (!userfaultfd_wp(dst_vma))
		pmd = pmd_clear_uffd_wp(pmd);
	pmd = pmd_wrprotect(pmd);
set_pmd:
	pmd = pmd_mkold(pmd);
	set_pmd_at(dst_mm, addr, dst_pmd, pmd);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
out:
	return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Mark the huge PUD young, and dirty too when @write is set, updating the
 * MMU cache if the access flags actually changed.
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
	       pud_t *pud, bool write)
{
	pud_t _pud;

	_pud = pud_mkyoung(*pud);
	if (write)
		_pud = pud_mkdirty(_pud);
	if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
				  pud, _pud, write))
		update_mmu_cache_pud(vma, addr, pud);
}

/*
 * Copy the huge PUD at @src_pud into @dst_pud with both PUD locks held.
 * Returns 0 on success or -EAGAIN when the source is neither a huge nor a
 * devmap PUD.
 */
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
		  struct vm_area_struct *vma)
{
	spinlock_t *dst_ptl, *src_ptl;
	pud_t pud;
	int ret;

	dst_ptl = pud_lock(dst_mm, dst_pud);
	src_ptl = pud_lockptr(src_mm, src_pud);
	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

	ret = -EAGAIN;
	pud = *src_pud;
	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
		goto out_unlock;

	/*
	 * TODO: once we support anonymous pages, use
	 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
*/
	/* A writable entry in a CoW mapping is write-protected on both sides. */
	if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
		pudp_set_wrprotect(src_mm, addr, src_pud);
		pud = pud_wrprotect(pud);
	}
	pud = pud_mkold(pud);
	set_pud_at(dst_mm, addr, dst_pud, pud);

	ret = 0;
out_unlock:
	spin_unlock(src_ptl);
	spin_unlock(dst_ptl);
	return ret;
}

/*
 * Under the PUD lock, touch the PUD for this fault unless it no longer
 * equals @orig_pud.
 */
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
	if (unlikely(!pud_same(*vmf->pud, orig_pud)))
		goto unlock;

	touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
	spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Under the PMD lock, touch the PMD for this fault unless it no longer
 * equals vmf->orig_pmd.
 */
void huge_pmd_set_accessed(struct vm_fault *vmf)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
		goto unlock;

	touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
	spin_unlock(vmf->ptl);
}

/*
 * Replace the PMD at the faulting address with a newly allocated PMD-sized
 * anonymous folio; called by do_huge_pmd_wp_page() for a huge zero PMD.
 * Returns VM_FAULT_FALLBACK when no folio could be allocated, the
 * check_stable_address_space() result when that fails, and 0 otherwise,
 * including when the PMD changed under us.
 */
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
{
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct mmu_notifier_range range;
	struct folio *folio;
	vm_fault_t ret = 0;

	folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
	if (unlikely(!folio))
		return VM_FAULT_FALLBACK;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
				haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
		goto release;
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto release;
	(void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
	map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
	goto unlock;
release:
	folio_put(folio);
unlock:
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);
	return ret;
}

/*
 * Write-protect (or unshare) fault on a huge PMD.  The folio is reused in
 * place when it is, or can be made, exclusive to this mapping; otherwise
 * the PMD is split and VM_FAULT_FALLBACK is returned.
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) { vm_fault_t ret = do_huge_zero_wp_pmd(vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; /* Fallback to splitting PMD if THP cannot be allocated */ goto fallback; } spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); return 0; } page = pmd_page(orig_pmd); folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; if (!folio_trylock(folio)) { folio_get(folio); spin_unlock(vmf->ptl); folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); folio_unlock(folio); folio_put(folio); return 0; } folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { folio_unlock(folio); goto reuse; } /* * See do_wp_page(): we can only reuse the folio exclusively if * there are no additional references. Note that we always drain * the LRU cache immediately after adding a THP. 
*/ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; if (folio_test_swapcache(folio)) folio_free_swap(folio); if (folio_ref_count(folio) == 1) { pmd_t entry; folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(page); folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); return 0; } entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); return 0; } unlock_fallback: folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static inline bool can_change_pmd_writable(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page; if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) return false; /* Don't touch entries that are not even readable (NUMA hinting). */ if (pmd_protnone(pmd)) return false; /* Do we need write faults for softdirty tracking? */ if (pmd_needs_soft_dirty_wp(vma, pmd)) return false; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_huge_pmd_wp(vma, pmd)) return false; if (!(vma->vm_flags & VM_SHARED)) { /* See can_change_pte_writable(). */ page = vm_normal_page_pmd(vma, addr, pmd); return page && PageAnon(page) && PageAnonExclusive(page); } /* See can_change_pte_writable(). 
*/
	return pmd_dirty(pmd);
}

/*
 * NUMA hinting page fault entry point for trans huge pmds
 *
 * Always returns 0.  The folio is either migrated to the node chosen by
 * numa_migrate_check(), or the PMD is restored with the VMA protection at
 * the out_map label.  task_numa_fault() is reported whenever a folio node
 * was determined.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	int nid = NUMA_NO_NODE;
	int target_nid, last_cpupid;
	pmd_t pmd, old_pmd;
	bool writable = false;
	int flags = 0;

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	old_pmd = pmdp_get(vmf->pmd);

	/* The PMD changed since the fault was taken: nothing to do. */
	if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}

	pmd = pmd_modify(old_pmd, vma->vm_page_prot);

	/*
	 * Detect now whether the PMD could be writable; this information
	 * is only valid while holding the PT lock.
	 */
	writable = pmd_write(pmd);
	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
	    can_change_pmd_writable(vma, vmf->address, pmd))
		writable = true;

	folio = vm_normal_folio_pmd(vma, haddr, pmd);
	if (!folio)
		goto out_map;

	nid = folio_nid(folio);

	target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
					&last_cpupid);
	if (target_nid == NUMA_NO_NODE)
		goto out_map;
	if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
		flags |= TNF_MIGRATE_FAIL;
		goto out_map;
	}
	/* The folio is isolated and isolation code holds a folio reference. */
	spin_unlock(vmf->ptl);
	/* The value computed above is stale once the PT lock is dropped. */
	writable = false;

	if (!migrate_misplaced_folio(folio, target_nid)) {
		flags |= TNF_MIGRATED;
		nid = target_nid;
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
		return 0;
	}

	/* Migration failed: retake the lock and revalidate before restoring. */
	flags |= TNF_MIGRATE_FAIL;
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
		spin_unlock(vmf->ptl);
		return 0;
	}
out_map:
	/* Restore the PMD */
	pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
	pmd = pmd_mkyoung(pmd);
	if (writable)
		pmd = pmd_mkwrite(pmd, vma);
	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
	spin_unlock(vmf->ptl);

	if (nid != NUMA_NO_NODE)
		task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
	return 0;
}

/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 *
 * False is returned, among other cases, when the PMD cannot be locked by
 * pmd_trans_huge_lock(), for the huge zero PMD, for non-present entries,
 * for folios that are likely mapped shared, when the folio lock cannot be
 * taken without blocking, and when only part of the huge page is covered
 * by [@addr, @next) (the folio is split in that case).
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		pmd_t *pmd, unsigned long addr, unsigned long next)
{
	spinlock_t *ptl;
	pmd_t orig_pmd;
	struct folio *folio;
	struct mm_struct *mm = tlb->mm;
	bool ret = false;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		goto out_unlocked;

	orig_pmd = *pmd;
	if (is_huge_zero_pmd(orig_pmd))
		goto out;

	if (unlikely(!pmd_present(orig_pmd))) {
		VM_BUG_ON(thp_migration_supported() &&
				  !is_pmd_migration_entry(orig_pmd));
		goto out;
	}

	folio = pmd_folio(orig_pmd);
	/*
	 * If other processes are mapping this folio, we couldn't discard
	 * the folio unless they all do MADV_FREE so let's skip the folio.
	 */
	if (folio_likely_mapped_shared(folio))
		goto out;

	if (!folio_trylock(folio))
		goto out;

	/*
	 * If user want to discard part-pages of THP, split it so MADV_FREE
	 * will deactivate only them.
*/
	if (next - addr != HPAGE_PMD_SIZE) {
		/* Keep the folio alive across the unlocked split. */
		folio_get(folio);
		spin_unlock(ptl);
		split_folio(folio);
		folio_unlock(folio);
		folio_put(folio);
		goto out_unlocked;
	}

	if (folio_test_dirty(folio))
		folio_clear_dirty(folio);
	folio_unlock(folio);

	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
		pmdp_invalidate(vma, addr, pmd);
		orig_pmd = pmd_mkold(orig_pmd);
		orig_pmd = pmd_mkclean(orig_pmd);

		set_pmd_at(mm, addr, pmd, orig_pmd);
		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	}

	folio_mark_lazyfree(folio);
	ret = true;
out:
	spin_unlock(ptl);
out_unlocked:
	return ret;
}

/*
 * zap_deposited_table - withdraw the page table deposited for @pmd, free
 * it and decrement the PTE page-table count of @mm.
 */
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
	pgtable_t pgtable;

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pte_free(mm, pgtable);
	mm_dec_nr_ptes(mm);
}

/*
 * zap_huge_pmd - tear down a huge PMD mapping.
 *
 * Returns 0 when __pmd_trans_huge_lock() does not return a lock, and 1
 * after the entry has been cleared.  The PMD lock is released on every
 * path before returning.
 */
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pmd_t *pmd, unsigned long addr)
{
	pmd_t orig_pmd;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;
	/*
	 * For architectures like ppc64 we look at deposited pgtable
	 * when calling pmdp_huge_get_and_clear. So do the
	 * pgtable_trans_huge_withdraw after finishing pmdp related
	 * operations.
	 */
	orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
						tlb->fullmm);
	arch_check_zapped_pmd(vma, orig_pmd);
	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
	if (vma_is_special_huge(vma)) {
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else if (is_huge_zero_pmd(orig_pmd)) {
		zap_deposited_table(tlb->mm, pmd);
		spin_unlock(ptl);
	} else {
		struct folio *folio = NULL;
		int flush_needed = 1;

		if (pmd_present(orig_pmd)) {
			struct page *page = pmd_page(orig_pmd);

			folio = page_folio(page);
			folio_remove_rmap_pmd(folio, page, vma);
			WARN_ON_ONCE(folio_mapcount(folio) < 0);
			VM_BUG_ON_PAGE(!PageHead(page), page);
		} else if (thp_migration_supported()) {
			swp_entry_t entry;

			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
			entry = pmd_to_swp_entry(orig_pmd);
			folio = pfn_swap_entry_folio(entry);
			/* A migration entry has no page to hand to the TLB gather. */
			flush_needed = 0;
		} else
			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

		/*
		 * NOTE(review): on the WARN_ONCE branch above folio is still
		 * NULL when it is used below; confirm that path cannot be
		 * reached in practice.
		 */
		if (folio_test_anon(folio)) {
			zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
		} else {
			if (arch_needs_pgtable_deposit())
				zap_deposited_table(tlb->mm, pmd);
			add_mm_counter(tlb->mm, mm_counter_file(folio),
				       -HPAGE_PMD_NR);
		}

		spin_unlock(ptl);
		if (flush_needed)
			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
	}
	return 1;
}

#ifndef pmd_move_must_withdraw
/*
 * Default pmd_move_must_withdraw(): used unless a macro of the same name
 * is already defined.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * With split pmd lock we also need to move preallocated
	 * PTE page table if new_pmd is on different PMD page table.
	 *
	 * We also don't deposit and withdraw tables for file pages.
*/
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif

/*
 * move_soft_dirty_pmd - return @pmd with the soft-dirty bit set.
 *
 * With CONFIG_MEM_SOFT_DIRTY the swap variant of the helper is used for a
 * PMD migration entry and the regular one for a present PMD; any other
 * entry, and every entry when the option is off, is returned unchanged.
 */
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (unlikely(is_pmd_migration_entry(pmd)))
		pmd = pmd_swp_mksoft_dirty(pmd);
	else if (pmd_present(pmd))
		pmd = pmd_mksoft_dirty(pmd);
#endif
	return pmd;
}

/*
 * clear_uffd_wp_pmd - return @pmd with the uffd-wp bit cleared, using the
 * present or the swap helper depending on the kind of entry; other entries
 * are returned unchanged.
 */
static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
{
	if (pmd_present(pmd))
		pmd = pmd_clear_uffd_wp(pmd);
	else if (is_swap_pmd(pmd))
		pmd = pmd_swp_clear_uffd_wp(pmd);

	return pmd;
}

/*
 * move_huge_pmd - move a huge PMD entry from @old_pmd to @new_pmd.
 *
 * Returns true when the entry was moved, false when the destination is
 * already populated or the source cannot be locked as a huge PMD.
 */
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	pmd_t pmd;
	struct mm_struct *mm = vma->vm_mm;
	bool force_flush = false;

	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it; but move_page_tables() might have already
	 * inserted a page table, if racing against shmem/file collapse.
	 */
	if (!pmd_none(*new_pmd)) {
		VM_BUG_ON(pmd_trans_huge(*new_pmd));
		return false;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
*/
	old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
	if (old_ptl) {
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
		/* Only a present entry requires flushing the old range. */
		if (pmd_present(pmd))
			force_flush = true;
		VM_BUG_ON(!pmd_none(*new_pmd));

		if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
			pgtable_t pgtable;
			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		}
		pmd = move_soft_dirty_pmd(pmd);
		if (vma_has_uffd_without_event_remap(vma))
			pmd = clear_uffd_wp_pmd(pmd);
		set_pmd_at(mm, new_addr, new_pmd, pmd);
		if (force_flush)
			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);
		return true;
	}
	return false;
}

/*
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pmd_t oldpmd, entry;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
	int ret = 1;

	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

	if (prot_numa && !thp_migration_supported())
		return 1;

	ptl = __pmd_trans_huge_lock(pmd, vma);
	if (!ptl)
		return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
	/*
	 * Swap (migration) PMD: only the entry type and the uffd-wp bit are
	 * adjusted; ret stays 1 on this path.
	 */
	if (is_swap_pmd(*pmd)) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);
		struct folio *folio = pfn_swap_entry_folio(entry);
		pmd_t newpmd;

		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
		if (is_writable_migration_entry(entry)) {
			/*
			 * A protection check is difficult so
			 * just be safe and disable write
			 */
			if (folio_test_anon(folio))
				entry = make_readable_exclusive_migration_entry(swp_offset(entry));
			else
				entry = make_readable_migration_entry(swp_offset(entry));
			newpmd = swp_entry_to_pmd(entry);
			if (pmd_swp_soft_dirty(*pmd))
				newpmd = pmd_swp_mksoft_dirty(newpmd);
		} else {
			newpmd = *pmd;
		}

		if (uffd_wp)
			newpmd = pmd_swp_mkuffd_wp(newpmd);
		else if (uffd_wp_resolve)
			newpmd = pmd_swp_clear_uffd_wp(newpmd);
		if (!pmd_same(*pmd, newpmd))
			set_pmd_at(mm, addr, pmd, newpmd);
		goto unlock;
	}
#endif

	if (prot_numa) {
		struct folio *folio;
		bool toptier;
		/*
		 * Avoid trapping faults against the zero page. The read-only
		 * data is likely to be read-cached on the local CPU and
		 * local/remote hits to the zero page are not interesting.
		 */
		if (is_huge_zero_pmd(*pmd))
			goto unlock;

		/* Already protnone: nothing to change. */
		if (pmd_protnone(*pmd))
			goto unlock;

		folio = pmd_folio(*pmd);
		toptier = node_is_toptier(folio_nid(folio));
		/*
		 * Skip scanning top tier node if normal numa
		 * balancing is disabled
		 */
		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
		    toptier)
			goto unlock;

		if (folio_use_access_time(folio))
			folio_xchg_access_time(folio,
					       jiffies_to_msecs(jiffies));
	}
	/*
	 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
	 * which is also under mmap_read_lock(mm):
	 *
	 *	CPU0:				CPU1:
	 *				change_huge_pmd(prot_numa=1)
	 *				 pmdp_huge_get_and_clear_notify()
	 * madvise_dontneed()
	 *  zap_pmd_range()
	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
	 *   // skip the pmd
	 *				 set_pmd_at();
	 *				 // pmd is re-established
	 *
	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
	 * which may break userspace.
	 *
	 * pmdp_invalidate_ad() is required to make sure we don't miss
	 * dirty/young flags set by hardware.
	 */
	oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

	entry = pmd_modify(oldpmd, newprot);
	if (uffd_wp)
		entry = pmd_mkuffd_wp(entry);
	else if (uffd_wp_resolve)
		/*
		 * Leave the write bit to be handled by PF interrupt
		 * handler, then things like COW could be properly
		 * handled.
		 */
		entry = pmd_clear_uffd_wp(entry);

	/* See change_pte_range(). */
	if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
	    can_change_pmd_writable(vma, addr, entry))
		entry = pmd_mkwrite(entry, vma);

	ret = HPAGE_PMD_NR;
	set_pmd_at(mm, addr, pmd, entry);

	if (huge_pmd_needs_flush(oldpmd, entry))
		tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Returns:
 *
 * - 0: if pud leaf changed from under us
 * - 1: if pud can be skipped
 * - HPAGE_PUD_NR: if pud was successfully processed
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		    pud_t *pudp, unsigned long addr, pgprot_t newprot,
		    unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pud_t oldpud, entry;
	spinlock_t *ptl;

	tlb_change_page_size(tlb, HPAGE_PUD_SIZE);

	/* NUMA balancing doesn't apply to dax */
	if (cp_flags & MM_CP_PROT_NUMA)
		return 1;

	/*
	 * Huge entries on userfault-wp only works with anonymous, while we
	 * don't have anonymous PUDs yet.
	 */
	if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
		return 1;

	ptl = __pud_trans_huge_lock(pudp, vma);
	if (!ptl)
		return 0;

	/*
	 * Can't clear PUD or it can race with concurrent zapping.  See
	 * change_huge_pmd().
	 */
	oldpud = pudp_invalidate(vma, addr, pudp);
	entry = pud_modify(oldpud, newprot);
	set_pud_at(mm, addr, pudp, entry);
	/* Unlike the PMD variant, the range is flushed unconditionally. */
	tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);

	spin_unlock(ptl);
	return HPAGE_PUD_NR;
}
#endif

#ifdef CONFIG_USERFAULTFD
/*
 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
 * the caller, but it must return after releasing the page_table_lock.
 * Just move the page from src_pmd to dst_pmd if possible.
 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
 * repeated by the caller, or other errors in case of failure.
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
			struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
			unsigned long dst_addr, unsigned long src_addr)
{
	pmd_t _dst_pmd, src_pmdval;
	struct page *src_page;
	struct folio *src_folio;
	struct anon_vma *src_anon_vma;
	spinlock_t *src_ptl, *dst_ptl;
	pgtable_t src_pgtable;
	struct mmu_notifier_range range;
	int err = 0;

	/* Snapshot the source entry; it is revalidated under both PT locks. */
	src_pmdval = *src_pmd;
	src_ptl = pmd_lockptr(mm, src_pmd);

	lockdep_assert_held(src_ptl);
	vma_assert_locked(src_vma);
	vma_assert_locked(dst_vma);

	/* Sanity checks before the operation */
	if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
	    WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
		spin_unlock(src_ptl);
		return -EINVAL;
	}

	if (!pmd_trans_huge(src_pmdval)) {
		spin_unlock(src_ptl);
		if (is_pmd_migration_entry(src_pmdval)) {
			/*
			 * NOTE(review): the wait is handed the address of the
			 * local snapshot rather than src_pmd; confirm this is
			 * intended.
			 */
			pmd_migration_entry_wait(mm, &src_pmdval);
			return -EAGAIN;
		}
		return -ENOENT;
	}

	src_page = pmd_page(src_pmdval);

	/* src_folio stays NULL for the huge zero PMD. */
	if (!is_huge_zero_pmd(src_pmdval)) {
		if (unlikely(!PageAnonExclusive(src_page))) {
			spin_unlock(src_ptl);
			return -EBUSY;
		}

		src_folio = page_folio(src_page);
		folio_get(src_folio);
	} else
		src_folio = NULL;

	spin_unlock(src_ptl);

	flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
				src_addr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (src_folio) {
		folio_lock(src_folio);

		/*
		 * split_huge_page walks the anon_vma chain without the page
		 * lock. Serialize against it with the anon_vma lock, the page
		 * lock is not enough.
		 */
		src_anon_vma = folio_get_anon_vma(src_folio);
		if (!src_anon_vma) {
			err = -EAGAIN;
			goto unlock_folio;
		}
		anon_vma_lock_write(src_anon_vma);
	} else
		src_anon_vma = NULL;

	dst_ptl = pmd_lockptr(mm, dst_pmd);
	double_pt_lock(src_ptl, dst_ptl);
	/* Both entries must still hold the values sampled earlier. */
	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
		     !pmd_same(*dst_pmd, dst_pmdval))) {
		err = -EAGAIN;
		goto unlock_ptls;
	}
	if (src_folio) {
		if (folio_maybe_dma_pinned(src_folio) ||
		    !PageAnonExclusive(&src_folio->page)) {
			err = -EBUSY;
			goto unlock_ptls;
		}

		if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
		    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
			err = -EBUSY;
			goto unlock_ptls;
		}

		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
		/* Folio got pinned from under us. Put it back and fail the move. */
		if (folio_maybe_dma_pinned(src_folio)) {
			set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
			err = -EBUSY;
			goto unlock_ptls;
		}

		folio_move_anon_rmap(src_folio, dst_vma);
		src_folio->index = linear_page_index(dst_vma, dst_addr);

		_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
		/* Follow mremap() behavior and treat the entry dirty after the move */
		_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
	} else {
		/* Huge zero page: rebuild the entry with the destination protection. */
		src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
		_dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
	}
	set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

	/* The deposited page table moves together with the entry. */
	src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
	pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
	double_pt_unlock(src_ptl, dst_ptl);
	if (src_anon_vma) {
		anon_vma_unlock_write(src_anon_vma);
		put_anon_vma(src_anon_vma);
	}
unlock_folio:
	/* unblock rmap walks */
	if (src_folio)
		folio_unlock(src_folio);
	mmu_notifier_invalidate_range_end(&range);
	if (src_folio)
		folio_put(src_folio);
	return err;
}
#endif /* CONFIG_USERFAULTFD */

/*
 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock.
So callers must unlock it.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;
	ptl = pmd_lock(vma->vm_mm, pmd);
	/* Swap, trans-huge and devmap entries all keep the lock held. */
	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
			pmd_devmap(*pmd)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

/*
 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
 *
 * Note that if it returns page table lock pointer, this routine returns without
 * unlocking page table lock. So callers must unlock it.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	ptl = pud_lock(vma->vm_mm, pud);
	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
		return ptl;
	spin_unlock(ptl);
	return NULL;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * zap_huge_pud - tear down a huge PUD mapping.
 *
 * Returns 0 when __pud_trans_huge_lock() does not return a lock and 1
 * after the entry has been cleared.  Only special huge VMAs are handled;
 * any other VMA hits BUG().
 */
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
		 pud_t *pud, unsigned long addr)
{
	spinlock_t *ptl;
	pud_t orig_pud;

	ptl = __pud_trans_huge_lock(pud, vma);
	if (!ptl)
		return 0;

	orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
	arch_check_zapped_pud(vma, orig_pud);
	tlb_remove_pud_tlb_entry(tlb, pud, addr);
	if (vma_is_special_huge(vma)) {
		spin_unlock(ptl);
		/* No zero page support yet */
	} else {
		/* No support for anonymous PUD pages yet */
		BUG();
	}
	return 1;
}

/*
 * __split_huge_pud_locked - "split" a huge PUD by clearing and flushing it.
 * @haddr must be HPAGE_PUD_SIZE aligned and the whole huge range must lie
 * inside @vma.  No lower-level table is populated here.
 */
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
		unsigned long haddr)
{
	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));

	count_vm_event(THP_SPLIT_PUD);

	pudp_huge_clear_flush(vma, haddr, pud);
}

/*
 * __split_huge_pud - split the huge PUD covering @address, if there is one.
 * The work happens under the PUD lock, bracketed by an MMU notifier
 * invalidation of the PUD-aligned range.
 */
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PUD_MASK,
				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pud_lock(vma->vm_mm, pud);
	if
(unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
		goto out;
	__split_huge_pud_locked(vma, pud, range.start);

out:
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}
#else
/* Without PUD-sized THP support there is nothing to split. */
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
		unsigned long address)
{
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * __split_huge_zero_page_pmd - replace a huge zero PMD by a PTE table.
 *
 * The deposited page table is withdrawn and filled with HPAGE_PMD_NR
 * special PTEs built from my_zero_pfn() with the VMA protection.  The
 * uffd-wp state of the old PMD is copied to every PTE.  The PMD is only
 * repopulated after all PTEs are written.
 */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
	pmd_t _pmd, old_pmd;
	unsigned long addr;
	pte_t *pte;
	int i;

	/*
	 * Leave pmd empty until pte is filled note that it is fine to delay
	 * notification until mmu_notifier_invalidate_range_end() as we are
	 * replacing a zero pmd write protected page with a zero pte write
	 * protected page.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	/* Populate a local PMD so the table can be filled before it is visible. */
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;

		entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
		if (pmd_uffd_wp(old_pmd))
			entry = pte_mkuffd_wp(entry);
		VM_BUG_ON(!pte_none(ptep_get(pte)));
		set_pte_at(mm, addr, pte, entry);
		pte++;
	}
	pte_unmap(pte - 1);
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

/*
 * __split_huge_pmd_locked - split the huge PMD at @haddr into PTEs.
 * @vma:    VMA that must fully contain the huge range
 * @pmd:    PMD to split; must be trans-huge, devmap or a migration entry
 * @haddr:  HPAGE_PMD_SIZE aligned start address
 * @freeze: install migration entries instead of present PTEs
 *
 * For non-anonymous VMAs the PMD is simply cleared (and accounting and
 * rmap are adjusted); no PTEs are installed.  The huge zero PMD is handed
 * to __split_huge_zero_page_pmd().  For anonymous mappings the withdrawn
 * page table is filled with either migration entries (@freeze or an
 * existing PMD migration entry) or present PTEs that carry over the
 * write, young, dirty, soft-dirty and uffd-wp state of the old PMD.
 */
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long haddr, bool freeze)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	struct page *page;
	pgtable_t pgtable;
	pmd_t old_pmd, _pmd;
	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
	bool anon_exclusive = false, dirty = false;
	unsigned long addr;
	pte_t *pte;
	int i;

	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
				&& !pmd_devmap(*pmd));

	count_vm_event(THP_SPLIT_PMD);

	if (!vma_is_anonymous(vma)) {
		old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
		/*
		 * We are going to unmap this huge page. So
		 * just go ahead and zap it
		 */
		if (arch_needs_pgtable_deposit())
			zap_deposited_table(mm, pmd);
		if (vma_is_special_huge(vma))
			return;
		if (unlikely(is_pmd_migration_entry(old_pmd))) {
			swp_entry_t entry;

			entry = pmd_to_swp_entry(old_pmd);
			folio = pfn_swap_entry_folio(entry);
		} else {
			page = pmd_page(old_pmd);
			folio = page_folio(page);
			/* Carry dirty and young state over from the PMD to the folio. */
			if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
				folio_mark_dirty(folio);
			if (!folio_test_referenced(folio) && pmd_young(old_pmd))
				folio_set_referenced(folio);
			folio_remove_rmap_pmd(folio, page, vma);
			folio_put(folio);
		}
		add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
		return;
	}

	if (is_huge_zero_pmd(*pmd)) {
		/*
		 * FIXME: Do we want to invalidate secondary mmu by calling
		 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
		 * inside __split_huge_pmd() ?
		 *
		 * We are going from a zero huge page write protected to zero
		 * small page also write protected so it does not seems useful
		 * to invalidate secondary mmu at this time.
		 */
		return __split_huge_zero_page_pmd(vma, haddr, pmd);
	}

	pmd_migration = is_pmd_migration_entry(*pmd);
	if (unlikely(pmd_migration)) {
		swp_entry_t entry;

		/* All state is decoded from the migration entry itself. */
		old_pmd = *pmd;
		entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(entry);
		write = is_writable_migration_entry(entry);
		if (PageAnon(page))
			anon_exclusive = is_readable_exclusive_migration_entry(entry);
		young = is_migration_entry_young(entry);
		dirty = is_migration_entry_dirty(entry);
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		/*
		 * Up to this point the pmd is present and huge and userland has
		 * the whole access to the hugepage during the split (which
		 * happens in place). If we overwrite the pmd with the not-huge
		 * version pointing to the pte here (which of course we could if
		 * all CPUs were bug free), userland could trigger a small page
		 * size TLB miss on the small sized TLB while the hugepage TLB
		 * entry is still established in the huge TLB. Some CPU doesn't
		 * like that. See
		 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
		 * 383 on page 105. Intel should be safe but is also warns that
		 * it's only safe if the permission and cache attributes of the
		 * two entries loaded in the two TLB is identical (which should
		 * be the case here). But it is generally safer to never allow
		 * small and huge TLB entries for the same virtual address to be
		 * loaded simultaneously. So instead of doing "pmd_populate();
		 * flush_pmd_tlb_range();" we first mark the current pmd
		 * notpresent (atomically because here the pmd_trans_huge must
		 * remain set at all times on the pmd until the split is
		 * complete for this pmd), then we flush the SMP TLB and finally
		 * we write the non-huge version of the pmd entry with
		 * pmd_populate.
		 */
		old_pmd = pmdp_invalidate(vma, haddr, pmd);
		page = pmd_page(old_pmd);
		folio = page_folio(page);
		if (pmd_dirty(old_pmd)) {
			dirty = true;
			folio_set_dirty(folio);
		}
		write = pmd_write(old_pmd);
		young = pmd_young(old_pmd);
		soft_dirty = pmd_soft_dirty(old_pmd);
		uffd_wp = pmd_uffd_wp(old_pmd);

		VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
		VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

		/*
		 * Without "freeze", we'll simply split the PMD, propagating the
		 * PageAnonExclusive() flag for each PTE by setting it for
		 * each subpage -- no need to (temporarily) clear.
		 *
		 * With "freeze" we want to replace mapped pages by
		 * migration entries right away. This is only possible if we
		 * managed to clear PageAnonExclusive() -- see
		 * set_pmd_migration_entry().
		 *
		 * In case we cannot clear PageAnonExclusive(), split the PMD
		 * only and let try_to_migrate_one() fail later.
		 *
		 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
		 */
		anon_exclusive = PageAnonExclusive(page);
		if (freeze && anon_exclusive &&
		    folio_try_share_anon_rmap_pmd(folio, page))
			freeze = false;
		if (!freeze) {
			rmap_t rmap_flags = RMAP_NONE;

			/* One reference per PTE; the PMD mapping already held one. */
			folio_ref_add(folio, HPAGE_PMD_NR - 1);
			if (anon_exclusive)
				rmap_flags |= RMAP_EXCLUSIVE;
			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
						 vma, haddr, rmap_flags);
		}
	}

	/*
	 * Withdraw the table only after we mark the pmd entry invalid.
	 * This's critical for some architectures (Power).
	 */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	pmd_populate(mm, &_pmd, pgtable);

	pte = pte_offset_map(&_pmd, haddr);
	VM_BUG_ON(!pte);

	/*
	 * Note that NUMA hinting access restrictions are not transferred to
	 * avoid any possibility of altering permissions across VMAs.
	 */
	if (freeze || pmd_migration) {
		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
			pte_t entry;
			swp_entry_t swp_entry;

			if (write)
				swp_entry = make_writable_migration_entry(
							page_to_pfn(page + i));
			else if (anon_exclusive)
				swp_entry = make_readable_exclusive_migration_entry(
							page_to_pfn(page + i));
			else
				swp_entry = make_readable_migration_entry(
							page_to_pfn(page + i));
			if (young)
				swp_entry = make_migration_entry_young(swp_entry);
			if (dirty)
				swp_entry = make_migration_entry_dirty(swp_entry);
			entry = swp_entry_to_pte(swp_entry);
			if (soft_dirty)
				entry = pte_swp_mksoft_dirty(entry);
			if (uffd_wp)
				entry = pte_swp_mkuffd_wp(entry);

			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
			set_pte_at(mm, addr, pte + i, entry);
		}
	} else {
		pte_t entry;

		entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
		if (write)
			entry = pte_mkwrite(entry, vma);
		if (!young)
			entry = pte_mkold(entry);
		/* NOTE: this may set soft-dirty too on some archs */
		if (dirty)
			entry = pte_mkdirty(entry);
		if (soft_dirty)
			entry = pte_mksoft_dirty(entry);
		if (uffd_wp)
			entry = pte_mkuffd_wp(entry);

		for (i = 0; i < HPAGE_PMD_NR; i++)
			VM_WARN_ON(!pte_none(ptep_get(pte + i)));

		set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
	}
	pte_unmap(pte);

	/* A PMD migration entry had no PMD rmap to remove. */
	if (!pmd_migration)
		folio_remove_rmap_pmd(folio, page, vma);
	if (freeze)
		put_page(page);

	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
}

/*
 * split_huge_pmd_locked - split @pmd if it is a huge leaf or a PMD
 * migration entry.
 * @address: must be HPAGE_PMD_SIZE aligned
 * @freeze:  passed through to __split_huge_pmd_locked(); requires @folio
 * @folio:   optional; when given it must be locked and PMD-mappable, and
 *           the split only happens if the PMD maps this folio
 */
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmd, bool freeze, struct folio *folio)
{
	VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
	VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
	VM_BUG_ON(freeze && !folio);

	/*
	 * When the caller requests to set up a migration entry, we
	 * require a folio to check the PMD against. Otherwise, there
	 * is a risk of replacing the wrong folio.
	 */
	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
	    is_pmd_migration_entry(*pmd)) {
		if (folio && folio != pmd_folio(*pmd))
			return;
		__split_huge_pmd_locked(vma, pmd, address, freeze);
	}
}

/*
 * __split_huge_pmd - take the PMD lock and split the huge PMD covering
 * @address.  The split is bracketed by an MMU notifier invalidation of the
 * PMD-aligned range; range.start is the aligned address handed on.
 */
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long address, bool freeze, struct folio *folio)
{
	spinlock_t *ptl;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				address & HPAGE_PMD_MASK,
				(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	ptl = pmd_lock(vma->vm_mm, pmd);
	split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
	spin_unlock(ptl);
	mmu_notifier_invalidate_range_end(&range);
}

/*
 * split_huge_pmd_address - look up the PMD for @address with mm_find_pmd()
 * and split it; does nothing when no PMD is found.
 */
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
		bool freeze, struct folio *folio)
{
	pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);

	if (!pmd)
		return;

	__split_huge_pmd(vma, pmd, address, freeze, folio);
}

/*
 * split_huge_pmd_if_needed - split the huge PMD around @address when
 * @address is not huge-page aligned and the surrounding aligned range lies
 * inside @vma.
 */
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
	/*
	 * If the new address isn't hpage aligned and it could previously
	 * contain an hugepage: check if we need to split an huge pmd.
*/
	if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
	    range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
			 ALIGN(address, HPAGE_PMD_SIZE)))
		split_huge_pmd_address(vma, address, false, NULL);
}

/*
 * vma_adjust_trans_huge - split huge PMDs that would straddle the new
 * boundaries of @vma.
 * @start, @end: new boundaries to check
 * @adjust_next: when positive, the start of the following VMA moves by
 *               this amount and is checked as well
 */
void vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start,
			     unsigned long end,
			     long adjust_next)
{
	/* Check if we need to split start first. */
	split_huge_pmd_if_needed(vma, start);

	/* Check if we need to split end next. */
	split_huge_pmd_if_needed(vma, end);

	/*
	 * If we're also updating the next vma vm_start,
	 * check if we need to split it.
	 */
	if (adjust_next > 0) {
		/*
		 * NOTE(review): the find_vma() result is dereferenced without
		 * a NULL check; presumably the caller guarantees a following
		 * VMA when adjust_next > 0 - confirm.
		 */
		struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
		unsigned long nstart = next->vm_start;
		nstart += adjust_next;
		split_huge_pmd_if_needed(next, nstart);
	}
}

/*
 * unmap_folio - unmap a large folio.  Anonymous folios go through
 * try_to_migrate(), all others through try_to_unmap() with
 * TTU_IGNORE_MLOCK; TTU_SPLIT_HUGE_PMD is added for PMD-mappable folios.
 * try_to_unmap_flush() is called at the end.
 */
static void unmap_folio(struct folio *folio)
{
	enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
		TTU_BATCH_FLUSH;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (folio_test_pmd_mappable(folio))
		ttu_flags |= TTU_SPLIT_HUGE_PMD;

	/*
	 * Anon pages need migration entries to preserve them, but file
	 * pages can simply be left unmapped, then faulted back on demand.
	 * If that is ever changed (perhaps for mlock), update remap_page().
	 */
	if (folio_test_anon(folio))
		try_to_migrate(folio, ttu_flags);
	else
		try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);

	try_to_unmap_flush();
}

/*
 * __discard_anon_folio_pmd_locked - try to drop a clean anonymous huge PMD
 * mapping without preserving its contents.
 *
 * Returns false, leaving or restoring the mapping, when the folio or the
 * PMD is dirty or when the folio has references beyond its mappings plus
 * one.  Returns true after the rmap, the deposited page table, the
 * MM_ANONPAGES accounting and one folio reference have been released.
 */
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
					    unsigned long addr, pmd_t *pmdp,
					    struct folio *folio)
{
	struct mm_struct *mm = vma->vm_mm;
	int ref_count, map_count;
	pmd_t orig_pmd = *pmdp;

	/* Cheap early exit before the PMD is cleared. */
	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
		return false;

	orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);

	/*
	 * Syncing against concurrent GUP-fast:
	 * - clear PMD; barrier; read refcount
	 * - inc refcount; barrier; read PMD
	 */
	smp_mb();

	ref_count = folio_ref_count(folio);
	map_count = folio_mapcount(folio);

	/*
	 * Order reads for folio refcount and dirty flag
	 * (see comments in __remove_mapping()).
	 */
	smp_rmb();

	/*
	 * If the folio or its PMD is redirtied at this point, or if there
	 * are unexpected references, we will give up to discard this folio
	 * and remap it.
	 *
	 * The only folio refs must be one from isolation plus the rmap(s).
	 */
	if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
	    ref_count != map_count + 1) {
		set_pmd_at(mm, addr, pmdp, orig_pmd);
		return false;
	}

	folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
	zap_deposited_table(mm, pmdp);
	add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
	if (vma->vm_flags & VM_LOCKED)
		mlock_drain_local();
	folio_put(folio);

	return true;
}

/*
 * unmap_huge_pmd_locked - unmap a PMD-mapped folio without splitting.
 * Only anonymous folios that are not swap-backed are handled, by way of
 * __discard_anon_folio_pmd_locked(); false is returned for everything
 * else.  @folio must be locked and @addr HPAGE_PMD_SIZE aligned.
 */
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
			   pmd_t *pmdp, struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));

	if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
		return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);

	return false;
}

/*
 * remap_page - remove migration PTEs for @nr pages starting at @folio,
 * stepping through consecutive folios with folio_next().  Non-anonymous
 * folios are skipped entirely.
 */
static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
	int i = 0;

	/* If unmap_folio() uses try_to_migrate() on file, remove this check */
	if (!folio_test_anon(folio))
		return;
	for (;;) {
		remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
		i += folio_nr_pages(folio);
		if (i >= nr)
			break;
		folio = folio_next(folio);
	}
}

/*
 * lru_add_page_tail - queue a tail page of a large folio being split.
 * With @list the tail gains a reference and is appended to @list.
 * Without it the tail is marked PageLRU and, unless the folio is
 * unevictable, linked right after the head on the LRU.  Called with the
 * lruvec lru_lock held.
 */
static void lru_add_page_tail(struct folio *folio, struct page *tail,
		struct lruvec *lruvec, struct list_head *list)
{
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	VM_BUG_ON_FOLIO(PageLRU(tail), folio);
	lockdep_assert_held(&lruvec->lru_lock);

	if (list) {
		/* page reclaim is reclaiming a huge page */
		VM_WARN_ON(folio_test_lru(folio));
		get_page(tail);
		list_add_tail(&tail->lru, list);
	} else {
		/* head is still on lru (and we have it frozen) */
		VM_WARN_ON(!folio_test_lru(folio));
		if (folio_test_unevictable(folio))
			tail->mlock_count = 0;
		else
			list_add_tail(&tail->lru, &folio->lru);
		SetPageLRU(tail);
	}
}

static void
__split_huge_page_tail(struct folio *folio, int tail, struct lruvec *lruvec, struct list_head *list, unsigned int new_order) { struct page *head = &folio->page; struct page *page_tail = head + tail; /* * Careful: new_folio is not a "real" folio before we cleared PageTail. * Don't pass it around before clear_compound_head(). */ struct folio *new_folio = (struct folio *)page_tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); /* * Clone page flags before unfreezing refcount. * * After successful get_page_unless_zero() might follow flags change, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | #ifdef CONFIG_ARCH_USES_PG_ARCH_2 (1L << PG_arch_2) | #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 (1L << PG_arch_3) | #endif (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); new_folio->mapping = folio->mapping; new_folio->index = folio->index + tail; /* * page->private should not be set in tail pages. Fix up and warn once * if private is unexpectedly set. 
*/ if (unlikely(page_tail->private)) { VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } if (folio_test_swapcache(folio)) new_folio->swap.val = folio->swap.val + tail; /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); /* * Clear PageTail before unfreezing page refcount. * * After successful get_page_unless_zero() might follow put_page() * which needs correct compound_head(). */ clear_compound_head(page_tail); if (new_order) { prep_compound_page(page_tail, new_order); folio_set_large_rmappable(new_folio); } /* Finally unfreeze refcount. Additional reference from page cache. */ page_ref_unfreeze(page_tail, 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ? folio_nr_pages(new_folio) : 0)); if (folio_test_young(folio)) folio_set_young(new_folio); if (folio_test_idle(folio)) folio_set_idle(new_folio); folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); /* * always add to the tail because some iterators expect new * pages to show after the currently processed elements - e.g. 
* migrate_pages */ lru_add_page_tail(folio, page_tail, lruvec, list); } static void __split_huge_page(struct page *page, struct list_head *list, pgoff_t end, unsigned int new_order) { struct folio *folio = page_folio(page); struct page *head = &folio->page; struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; int i, nr_dropped = 0; unsigned int new_nr = 1 << new_order; int order = folio_order(folio); unsigned int nr = 1 << order; /* complete memcg works before add pages to LRU */ split_page_memcg(head, order, new_order); if (folio_test_anon(folio) && folio_test_swapcache(folio)) { offset = swap_cache_index(folio->swap); swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); folio_clear_has_hwpoisoned(folio); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { struct folio *tail; __split_huge_page_tail(folio, i, lruvec, list, new_order); tail = page_folio(head + i); /* Some pages can be beyond EOF: drop them from page cache */ if (tail->index >= end) { if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); } else if (!folio_test_anon(folio)) { __xa_store(&folio->mapping->i_pages, tail->index, tail, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, tail, 0); } } if (!new_order) ClearPageCompound(head); else { struct folio *new_folio = (struct folio *)head; folio_set_order(new_folio, new_order); } unlock_page_lruvec(lruvec); /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, order, new_order); pgalloc_tag_split(folio, order, new_order); /* See comment in __split_huge_page_tail() */ if (folio_test_anon(folio)) { /* Additional pin to swap cache */ if (folio_test_swapcache(folio)) { folio_ref_add(folio, 1 + 
new_nr); xa_unlock(&swap_cache->i_pages); } else { folio_ref_inc(folio); } } else { /* Additional pin to page cache */ folio_ref_add(folio, 1 + new_nr); xa_unlock(&folio->mapping->i_pages); } local_irq_enable(); if (nr_dropped) shmem_uncharge(folio->mapping->host, nr_dropped); remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0); /* * set page to its compound_head when split to non order-0 pages, so * we can skip unlocking it below, since PG_locked is transferred to * the compound_head of the page and the caller will unlock it. */ if (new_order) page = compound_head(page); for (i = 0; i < nr; i += new_nr) { struct page *subpage = head + i; struct folio *new_folio = page_folio(subpage); if (subpage == page) continue; folio_unlock(new_folio); /* * Subpages may be freed if there wasn't any mapping * like if add_to_swap() is running on a lru page that * had its mapping zapped. And freeing these pages * requires taking the lru_lock so we do the put_page * of the tail pages after the split is complete. */ free_page_and_swap_cache(subpage); } } /* Racy check whether the huge page can be split */ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { int extra_pins; /* Additional pins from page cache */ if (folio_test_anon(folio)) extra_pins = folio_test_swapcache(folio) ? folio_nr_pages(folio) : 0; else extra_pins = folio_nr_pages(folio); if (pextra_pins) *pextra_pins = extra_pins; return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - caller_pins; } /* * This function splits a large folio into smaller folios of order @new_order. * @page can point to any page of the large folio to split. The split operation * does not change the position of @page. * * Prerequisites: * * 1) The caller must hold a reference on the @page's owning folio, also known * as the large folio. * * 2) The large folio must be locked. * * 3) The folio must not be pinned. 
Any unexpected folio references, including * GUP pins, will result in the folio not getting split; instead, the caller * will receive an -EAGAIN. * * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not * supported for non-file-backed folios, because folio->_deferred_list, which * is used by partially mapped folios, is stored in subpage 2, but an order-1 * folio only has subpages 0 and 1. File-backed order-1 folios are supported, * since they do not use _deferred_list. * * After splitting, the caller's folio reference will be transferred to @page, * resulting in a raised refcount of @page after this call. The other pages may * be freed if they are not mapped. * * If @list is null, tail pages will be added to LRU list, otherwise, to @list. * * Pages in @new_order will inherit the mapping, flags, and so on from the * huge page. * * Returns 0 if the huge page was split successfully. * * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if * the folio was concurrently removed from the page cache. * * Returns -EBUSY when trying to split the huge zeropage, if the folio is * under writeback, if fs-specific folio metadata cannot currently be * released, or if some unexpected race happened (e.g., anon VMA disappeared, * truncation). * * Callers should ensure that the order respects the address space mapping * min-order if one is set for non-anonymous folios. * * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. 
*/ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { struct folio *folio = page_folio(page); struct deferred_split *ds_queue = get_deferred_split_queue(folio); /* reset xarray order to new order after split */ XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order); bool is_anon = folio_test_anon(folio); struct address_space *mapping = NULL; struct anon_vma *anon_vma = NULL; int order = folio_order(folio); int extra_pins, ret; pgoff_t end; bool is_hzp; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (new_order >= folio_order(folio)) return -EINVAL; if (is_anon) { /* order-1 is not supported for anonymous THP. */ if (new_order == 1) { VM_WARN_ONCE(1, "Cannot split to order-1 folio"); return -EINVAL; } } else if (new_order) { /* Split shmem folio to non-zero order not supported */ if (shmem_mapping(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split shmem folio to non-0 order"); return -EINVAL; } /* * No split if the file system does not support large folio. * Note that we might still have THPs in such mappings due to * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping * does not actually support large folios properly. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !mapping_large_folio_support(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split file folio to non-0 order"); return -EINVAL; } } /* Only swapping a whole PMD-mapped folio is supported */ if (folio_test_swapcache(folio) && new_order) return -EINVAL; is_hzp = is_huge_zero_folio(folio); if (is_hzp) { pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); return -EBUSY; } if (folio_test_writeback(folio)) return -EBUSY; if (is_anon) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a * reference to it and then lock the anon_vma for write. 
This * is similar to folio_lock_anon_vma_read except the write lock * is taken to serialise against parallel split or collapse * operations. */ anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; } end = -1; mapping = NULL; anon_vma_lock_write(anon_vma); } else { unsigned int min_order; gfp_t gfp; mapping = folio->mapping; /* Truncated ? */ if (!mapping) { ret = -EBUSY; goto out; } min_order = mapping_min_folio_order(folio->mapping); if (new_order < min_order) { VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", min_order); ret = -EINVAL; goto out; } gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); if (!filemap_release_folio(folio, gfp)) { ret = -EBUSY; goto out; } xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; } anon_vma = NULL; i_mmap_lock_read(mapping); /* *__split_huge_page() may need to trim off pages beyond EOF: * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) end = shmem_fallocend(mapping->host, end); } /* * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, 1, &extra_pins)) { ret = -EAGAIN; goto out_unlock; } unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* * Check if the folio is present in page cache. * We assume all tail are present too, if folio is there. 
*/ xas_lock(&xas); xas_reset(&xas); if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { if (folio_order(folio) > 1 && !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } /* * Reinitialize page_deferred_list after removing the * page from the split_queue, otherwise a subsequent * split will see list corruption when checking the * page_deferred_list. */ list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { int nr = folio_nr_pages(folio); xas_split(&xas, folio, folio_order(folio)); if (folio_test_pmd_mappable(folio) && new_order < HPAGE_PMD_ORDER) { if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } } } if (is_anon) { mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order)); } __split_huge_page(page, list, end, new_order); ret = 0; } else { spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xas_unlock(&xas); local_irq_enable(); remap_page(folio, folio_nr_pages(folio), 0); ret = -EAGAIN; } out_unlock: if (anon_vma) { anon_vma_unlock_write(anon_vma); put_anon_vma(anon_vma); } if (mapping) i_mmap_unlock_read(mapping); out: xas_destroy(&xas); if (order == HPAGE_PMD_ORDER) count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); count_mthp_stat(order, !ret ? 
MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); return ret; } int min_order_for_split(struct folio *folio) { if (folio_test_anon(folio)) return 0; if (!folio->mapping) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_SPLIT_PAGE_FAILED); return -EBUSY; } return mapping_min_folio_order(folio->mapping); } int split_folio_to_list(struct folio *folio, struct list_head *list) { int ret = min_order_for_split(folio); if (ret < 0) return ret; return split_huge_page_to_list_to_order(&folio->page, list, ret); } /* * __folio_unqueue_deferred_split() is not to be called directly: * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h * limits its calls to those folios which may have a _deferred_list for * queueing THP splits, and that list is (racily observed to be) non-empty. * * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is * zero: because even when split_queue_lock is held, a non-empty _deferred_list * might be in use on deferred_split_scan()'s unlocked on-stack list. * * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is * therefore important to unqueue deferred split before changing folio memcg. 
*/ bool __folio_unqueue_deferred_split(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; bool unqueued = false; WARN_ON_ONCE(folio_ref_count(folio)); WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } list_del_init(&folio->_deferred_list); unqueued = true; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); return unqueued; /* useful for debug warnings */ } /* partially_mapped=false won't clear PG_partially_mapped folio flag */ void deferred_split_folio(struct folio *folio, bool partially_mapped) { struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG struct mem_cgroup *memcg = folio_memcg(folio); #endif unsigned long flags; /* * Order 1 folios have no space for a deferred list, but we also * won't waste much memory by not adding them to the deferred list. */ if (folio_order(folio) <= 1) return; if (!partially_mapped && !split_underused_thp) return; /* * Exclude swapcache: originally to avoid a corrupt deferred split * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); * but if page reclaim is already handling the same folio, it is * unnecessary to handle it again in the shrinker, so excluding * swapcache here may still be a useful optimization. 
*/ if (folio_test_swapcache(folio)) return; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1); } } else { /* partially mapped folios cannot become non-partially mapped */ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio); } if (list_empty(&folio->_deferred_list)) { list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), deferred_split_shrinker->id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } static unsigned long deferred_split_count(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; #ifdef CONFIG_MEMCG if (sc->memcg) ds_queue = &sc->memcg->deferred_split_queue; #endif return READ_ONCE(ds_queue->split_queue_len); } static bool thp_underused(struct folio *folio) { int num_zero_pages = 0, num_filled_pages = 0; void *kaddr; int i; if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) return false; for (i = 0; i < folio_nr_pages(folio); i++) { kaddr = kmap_local_folio(folio, i * PAGE_SIZE); if (!memchr_inv(kaddr, 0, PAGE_SIZE)) { num_zero_pages++; if (num_zero_pages > khugepaged_max_ptes_none) { kunmap_local(kaddr); return true; } } else { /* * Another path for early exit once the number * of non-zero filled pages exceeds threshold. 
*/ num_filled_pages++; if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) { kunmap_local(kaddr); return false; } } kunmap_local(kaddr); } return false; } static unsigned long deferred_split_scan(struct shrinker *shrink, struct shrink_control *sc) { struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list); struct folio *folio, *next, *prev = NULL; int split = 0, removed = 0; #ifdef CONFIG_MEMCG if (sc->memcg) ds_queue = &sc->memcg->deferred_split_queue; #endif spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ list_for_each_entry_safe(folio, next, &ds_queue->split_queue, _deferred_list) { if (folio_try_get(folio)) { list_move(&folio->_deferred_list, &list); } else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) { folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) break; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); list_for_each_entry_safe(folio, next, &list, _deferred_list) { bool did_split = false; bool underused = false; if (!folio_test_partially_mapped(folio)) { underused = thp_underused(folio); if (!underused) goto next; } if (!folio_trylock(folio)) goto next; if (!split_folio(folio)) { did_split = true; if (underused) count_vm_event(THP_UNDERUSED_SPLIT_PAGE); split++; } folio_unlock(folio); next: /* * split_folio() removes folio from list on success. * Only add back to the queue if folio is partially mapped. * If thp_underused returns false, or if split_folio fails * in the case it was underused, then consider it used and * don't add it back to split_queue. 
*/ if (did_split) { ; /* folio already removed from list */ } else if (!folio_test_partially_mapped(folio)) { list_del_init(&folio->_deferred_list); removed++; } else { /* * That unlocked list_del_init() above would be unsafe, * unless its folio is separated from any earlier folios * left on the list (which may be concurrently unqueued) * by one safe folio with refcount still raised. */ swap(folio, prev); } if (folio) folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); list_splice_tail(&list, &ds_queue->split_queue); ds_queue->split_queue_len -= removed; spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); if (prev) folio_put(prev); /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. */ if (!split && list_empty(&ds_queue->split_queue)) return SHRINK_STOP; return split; } #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { struct zone *zone; struct page *page; struct folio *folio; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; pr_debug("Split all THPs\n"); for_each_zone(zone) { if (!managed_zone(zone)) continue; max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { int nr_pages; page = pfn_to_online_page(pfn); if (!page || PageTail(page)) continue; folio = page_folio(page); if (!folio_try_get(folio)) continue; if (unlikely(page_folio(page) != folio)) goto next; if (zone != folio_zone(folio)) goto next; if (!folio_test_large(folio) || folio_test_hugetlb(folio) || !folio_test_lru(folio)) goto next; total++; folio_lock(folio); nr_pages = folio_nr_pages(folio); if (!split_folio(folio)) split++; pfn += nr_pages - 1; folio_unlock(folio); next: folio_put(folio); cond_resched(); } } pr_debug("%lu of %lu THP split\n", split, total); } static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) { return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || is_vm_hugetlb_page(vma); } 
static int split_huge_pages_pid(int pid, unsigned long vaddr_start, unsigned long vaddr_end, unsigned int new_order) { int ret = 0; struct task_struct *task; struct mm_struct *mm; unsigned long total = 0, split = 0; unsigned long addr; vaddr_start &= PAGE_MASK; vaddr_end &= PAGE_MASK; task = find_get_task_by_vpid(pid); if (!task) { ret = -ESRCH; goto out; } /* Find the mm_struct */ mm = get_task_mm(task); put_task_struct(task); if (!mm) { ret = -EINVAL; goto out; } pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", pid, vaddr_start, vaddr_end); mmap_read_lock(mm); /* * always increase addr by PAGE_SIZE, since we could have a PTE page * table filled with PTE-mapped THPs, each of which is distinct. */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct folio_walk fw; struct folio *folio; struct address_space *mapping; unsigned int target_order = new_order; if (!vma) break; /* skip special VMA and hugetlb VMA */ if (vma_not_suitable_for_thp_split(vma)) { addr = vma->vm_end; continue; } folio = folio_walk_start(&fw, vma, addr, 0); if (!folio) continue; if (!is_transparent_hugepage(folio)) goto next; if (!folio_test_anon(folio)) { mapping = folio->mapping; target_order = max(new_order, mapping_min_folio_order(mapping)); } if (target_order >= folio_order(folio)) goto next; total++; /* * For folios with private, split_huge_page_to_list_to_order() * will try to drop it before split and then check if the folio * can be split or not. So skip the check here. 
*/ if (!folio_test_private(folio) && !can_split_folio(folio, 0, NULL)) goto next; if (!folio_trylock(folio)) goto next; folio_get(folio); folio_walk_end(&fw, vma); if (!folio_test_anon(folio) && folio->mapping != mapping) goto unlock; if (!split_folio_to_order(folio, target_order)) split++; unlock: folio_unlock(folio); folio_put(folio); cond_resched(); continue; next: folio_walk_end(&fw, vma); cond_resched(); } mmap_read_unlock(mm); mmput(mm); pr_debug("%lu of %lu THP split\n", split, total); out: return ret; } static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, pgoff_t off_end, unsigned int new_order) { struct filename *file; struct file *candidate; struct address_space *mapping; int ret = -EINVAL; pgoff_t index; int nr_pages = 1; unsigned long total = 0, split = 0; unsigned int min_order; unsigned int target_order; file = getname_kernel(file_path); if (IS_ERR(file)) return ret; candidate = file_open_name(file, O_RDONLY, 0); if (IS_ERR(candidate)) goto out; pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", file_path, off_start, off_end); mapping = candidate->f_mapping; min_order = mapping_min_folio_order(mapping); target_order = max(new_order, min_order); for (index = off_start; index < off_end; index += nr_pages) { struct folio *folio = filemap_get_folio(mapping, index); nr_pages = 1; if (IS_ERR(folio)) continue; if (!folio_test_large(folio)) goto next; total++; nr_pages = folio_nr_pages(folio); if (target_order >= folio_order(folio)) goto next; if (!folio_trylock(folio)) goto next; if (folio->mapping != mapping) goto unlock; if (!split_folio_to_order(folio, target_order)) split++; unlock: folio_unlock(folio); next: folio_put(folio); cond_resched(); } filp_close(candidate, NULL); ret = 0; pr_debug("%lu of %lu file-backed THP split\n", split, total); out: putname(file); return ret; } #define MAX_INPUT_BUF_SZ 255 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, size_t count, 
loff_t *ppops) { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; /* * hold pid, start_vaddr, end_vaddr, new_order or * file_path, off_start, off_end, new_order */ char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; unsigned int new_order = 0; ret = mutex_lock_interruptible(&split_debug_mutex); if (ret) return ret; ret = -EFAULT; memset(input_buf, 0, MAX_INPUT_BUF_SZ); if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) goto out; input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; if (input_buf[0] == '/') { char *tok; char *tok_buf = input_buf; char file_path[MAX_INPUT_BUF_SZ]; pgoff_t off_start = 0, off_end = 0; size_t input_len = strlen(input_buf); tok = strsep(&tok_buf, ","); if (tok && tok_buf) { strscpy(file_path, tok); } else { ret = -EINVAL; goto out; } ret = sscanf(tok_buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order); if (ret != 2 && ret != 3) { ret = -EINVAL; goto out; } ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order); if (!ret) ret = input_len; goto out; } ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order); if (ret == 1 && pid == 1) { split_huge_pages_all(); ret = strlen(input_buf); goto out; } else if (ret != 3 && ret != 4) { ret = -EINVAL; goto out; } ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order); if (!ret) ret = strlen(input_buf); out: mutex_unlock(&split_debug_mutex); return ret; } static const struct file_operations split_huge_pages_fops = { .owner = THIS_MODULE, .write = split_huge_pages_write, }; static int __init split_huge_pages_debugfs(void) { debugfs_create_file("split_huge_pages", 0200, NULL, NULL, &split_huge_pages_fops); return 0; } late_initcall(split_huge_pages_debugfs); #endif #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { struct folio *folio = page_folio(page); struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = 
vma->vm_mm; unsigned long address = pvmw->address; bool anon_exclusive; pmd_t pmdval; swp_entry_t entry; pmd_t pmdswp; if (!(pvmw->pmd && !pvmw->pte)) return 0; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); pmdval = pmdp_invalidate(vma, address, pvmw->pmd); /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) { set_pmd_at(mm, address, pvmw->pmd, pmdval); return -EBUSY; } if (pmd_dirty(pmdval)) folio_mark_dirty(folio); if (pmd_write(pmdval)) entry = make_writable_migration_entry(page_to_pfn(page)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); else entry = make_readable_migration_entry(page_to_pfn(page)); if (pmd_young(pmdval)) entry = make_migration_entry_young(entry); if (pmd_dirty(pmdval)) entry = make_migration_entry_dirty(entry); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); if (pmd_uffd_wp(pmdval)) pmdswp = pmd_swp_mkuffd_wp(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); folio_remove_rmap_pmd(folio, page, vma); folio_put(folio); trace_set_migration_pmd(address, pmd_val(pmdswp)); return 0; } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { struct folio *folio = page_folio(new); struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; swp_entry_t entry; if (!(pvmw->pmd && !pvmw->pte)) return; entry = pmd_to_swp_entry(*pvmw->pmd); folio_get(folio); pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) pmde = pmd_mkwrite(pmde, vma); if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_mkuffd_wp(pmde); if (!is_migration_entry_young(entry)) pmde = 
pmd_mkold(pmde); /* NOTE: this may contain setting soft-dirty on some archs */ if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pmde = pmd_mkdirty(pmde); if (folio_test_anon(folio)) { rmap_t rmap_flags = RMAP_NONE; if (!is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags); } else { folio_add_file_rmap_pmd(folio, new, vma); } VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new)); set_pmd_at(mm, haddr, pvmw->pmd, pmde); /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif
8 53 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ #ifndef _EXT4_EXTENTS #define _EXT4_EXTENTS #include "ext4.h" /* * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks * becomes very small, so index split, in-depth growing and * other hard changes happen much more often. * This is for debug purposes only. */ #define AGGRESSIVE_TEST_ /* * With EXTENTS_STATS defined, the number of blocks and extents * are collected in the truncate path. They'll be shown at * umount time. */ #define EXTENTS_STATS__ /* * If CHECK_BINSEARCH is defined, then the results of the binary search * will also be checked by linear search. */ #define CHECK_BINSEARCH__ /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. */ #define EXT_STATS_ /* * ext4_inode has i_block array (60 bytes total). 
* The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. * For non-inode extent blocks, ext4_extent_tail * follows the array. */ /* * This is the extent tail on-disk structure. * All other extent structures are 12 bytes long. It turns out that * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which * covers all valid ext4 block sizes. Therefore, this tail structure can be * crammed into the end of the block without having to rebalance the tree. */ struct ext4_extent_tail { __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ }; /* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ struct ext4_extent { __le32 ee_block; /* first logical block extent covers */ __le16 ee_len; /* number of blocks covered by extent */ __le16 ee_start_hi; /* high 16 bits of physical block */ __le32 ee_start_lo; /* low 32 bits of physical block */ }; /* * This is index on-disk structure. * It's used at all the levels except the bottom. */ struct ext4_extent_idx { __le32 ei_block; /* index covers logical blocks from 'block' */ __le32 ei_leaf_lo; /* pointer to the physical block of the next * * level. leaf or next index could be there */ __le16 ei_leaf_hi; /* high 16 bits of physical block */ __u16 ei_unused; }; /* * Each block (leaves and indexes), even inode-stored has header. */ struct ext4_extent_header { __le16 eh_magic; /* probably will support different formats */ __le16 eh_entries; /* number of valid entries */ __le16 eh_max; /* capacity of store in entries */ __le16 eh_depth; /* has tree real underlying blocks? 
*/ __le32 eh_generation; /* generation of the tree */ }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) #define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) static inline struct ext4_extent_tail * find_ext4_extent_tail(struct ext4_extent_header *eh) { return (struct ext4_extent_tail *)(((void *)eh) + EXT4_EXTENT_TAIL_OFFSET(eh)); } /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. * Truncate uses it to simulate recursive walking. */ struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; struct buffer_head *p_bh; }; /* * Used to record a portion of a cluster found at the beginning or end * of an extent while traversing the extent tree during space removal. * A partial cluster may be removed if it does not contain blocks shared * with extents that aren't being deleted (tofree state). Otherwise, * it cannot be removed (nofree state). */ struct partial_cluster { ext4_fsblk_t pclu; /* physical cluster number */ ext4_lblk_t lblk; /* logical block number within logical cluster */ enum {initial, tofree, nofree} state; }; /* * structure for external API */ /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this * particular extent is an initialized extent or an unwritten (i.e. * preallocated). * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an * unwritten one. In other words, if MSB of ee_len is set, it is an * unwritten extent with only one special scenario when ee_len = 0x8000. 
* In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) #define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ ((struct ext4_extent *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_FIRST_INDEX(__hdr__) \ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_HAS_FREE_INDEX(__path__) \ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ < le16_to_cpu((__path__)->p_hdr->eh_max)) #define EXT_LAST_EXTENT(__hdr__) \ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { return (struct ext4_extent_header *) EXT4_I(inode)->i_data; } static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) { return (struct ext4_extent_header *) bh->b_data; } static inline unsigned short ext_depth(struct inode *inode) { return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { /* We can not have an unwritten extent of zero length! 
*/ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); } static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) { return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? le16_to_cpu(ext->ee_len) : (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) { ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } /* * ext4_ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; block = le32_to_cpu(ex->ee_start_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; return block; } /* * ext4_idx_pblock: * combine low and high parts of a leaf physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) { ext4_fsblk_t block; block = le32_to_cpu(ix->ei_leaf_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; return block; } /* * ext4_ext_store_pblock: * stores a large physical block number into an extent struct, * breaking it into parts */ static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) { ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } /* * ext4_idx_store_pblock: * stores a large physical block number into an index struct, * breaking it into parts */ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) { ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } #endif /* _EXT4_EXTENTS */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 #endif #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = 
iowait_usecs * NSEC_PER_USEC; return iowait; } static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; while (gap > 0) { unsigned int inc; inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); seq_write(p, zeros, 2 * inc); gap -= inc; } } static void show_all_irqs(struct seq_file *p) { unsigned int i, next = 0; for_each_active_irq(i) { show_irq_gap(p, i - next); seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } show_irq_gap(p, irq_get_nr_irqs() - next); } static int show_stat(struct seq_file *p, void *v) { int i, j; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); /* shift boot timestamp according to the timens offset */ timens_sub_boottime(&boottime); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; nice += cpustat[CPUTIME_NICE]; system += cpustat[CPUTIME_SYSTEM]; idle += get_idle_time(&kcpustat, i); iowait += get_iowait_time(&kcpustat, i); irq += cpustat[CPUTIME_IRQ]; softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); per_softirq_sums[j] += softirq_stat; sum_softirq += softirq_stat; } } sum += arch_irq_stat(); seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); 
seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = cpustat[CPUTIME_USER]; nice = cpustat[CPUTIME_NICE]; system = cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(&kcpustat, i); iowait = get_iowait_time(&kcpustat, i); irq = cpustat[CPUTIME_IRQ]; softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" "procs_running %u\n" "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to 
display an interrupt count : 2 bytes */ size += 2 * irq_get_nr_irqs(); return single_open_size(file, show_stat, NULL, size); } static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; static int __init proc_stat_init(void) { proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init);
1 2 14 2 13 1 1 11 2 1 1 20 1 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0 /* * fs/isofs/export.c * * (C) 2004 Paul Serice - The new inode scheme requires switching * from iget() to iget5_locked() which means * the NFS export operations have to be hand * coded because the default routines rely on * iget(). * * The following files are helpful: * * Documentation/filesystems/nfs/exporting.rst * fs/exportfs/expfs.c. */ #include "isofs.h" static struct dentry * isofs_export_iget(struct super_block *sb, unsigned long block, unsigned long offset, __u32 generation) { struct inode *inode; if (block == 0) return ERR_PTR(-ESTALE); inode = isofs_iget(sb, block, offset); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return d_obtain_alias(inode); } /* This function is surprisingly simple. The trick is understanding * that "child" is always a directory. So, to find its parent, you * simply need to find its ".." entry, normalize its block and offset, * and return the underlying inode. See the comments for * isofs_normalize_block_and_offset(). 
*/ static struct dentry *isofs_export_get_parent(struct dentry *child) { unsigned long parent_block = 0; unsigned long parent_offset = 0; struct inode *child_inode = d_inode(child); struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; struct dentry *rv = NULL; /* "child" must always be a directory. */ if (!S_ISDIR(child_inode->i_mode)) { printk(KERN_ERR "isofs: isofs_export_get_parent(): " "child is not a directory!\n"); rv = ERR_PTR(-EACCES); goto out; } /* It is an invariant that the directory offset is zero. If * it is not zero, it means the directory failed to be * normalized for some reason. */ if (e_child_inode->i_iget5_offset != 0) { printk(KERN_ERR "isofs: isofs_export_get_parent(): " "child directory not normalized!\n"); rv = ERR_PTR(-EACCES); goto out; } /* The child inode has been normalized such that its * i_iget5_block value points to the "." entry. Fortunately, * the ".." entry is located in the same block. */ parent_block = e_child_inode->i_iget5_block; /* Get the block in question. */ bh = sb_bread(child_inode->i_sb, parent_block); if (bh == NULL) { rv = ERR_PTR(-EACCES); goto out; } /* This is the "." entry. */ de = (struct iso_directory_record*)bh->b_data; /* The ".." entry is always the second entry. */ parent_offset = (unsigned long)isonum_711(de->length); de = (struct iso_directory_record*)(bh->b_data + parent_offset); /* Verify it is in fact the ".." entry. 
*/ if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { printk(KERN_ERR "isofs: Unable to find the \"..\" " "directory for NFS.\n"); rv = ERR_PTR(-EACCES); goto out; } /* Normalize */ isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, parent_offset)); out: if (bh) brelse(bh); return rv; } static int isofs_export_encode_fh(struct inode *inode, __u32 *fh32, int *max_len, struct inode *parent) { struct iso_inode_info * ei = ISOFS_I(inode); int len = *max_len; int type = 1; __u16 *fh16 = (__u16*)fh32; /* * WARNING: max_len is 5 for NFSv2. Because of this * limitation, we use the lower 16 bits of fh32[1] to hold the * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ if (parent && (len < 5)) { *max_len = 5; return FILEID_INVALID; } else if (len < 3) { *max_len = 3; return FILEID_INVALID; } len = 3; fh32[0] = ei->i_iget5_block; fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ fh16[3] = 0; /* avoid leaking uninitialized data */ fh32[2] = inode->i_generation; if (parent) { struct iso_inode_info *eparent; eparent = ISOFS_I(parent); fh32[3] = eparent->i_iget5_block; fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ fh32[4] = parent->i_generation; len = 5; type = 2; } *max_len = len; return type; } struct isofs_fid { u32 block; u16 offset; u16 parent_offset; u32 generation; u32 parent_block; u32 parent_generation; }; static struct dentry *isofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct isofs_fid *ifid = (struct isofs_fid *)fid; if (fh_len < 3 || fh_type > 2) return NULL; return isofs_export_iget(sb, ifid->block, ifid->offset, ifid->generation); } static struct dentry *isofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct isofs_fid *ifid = (struct isofs_fid *)fid; if (fh_len < 2 || fh_type != 2) return NULL; return isofs_export_iget(sb, fh_len > 2 ? 
ifid->parent_block : 0, ifid->parent_offset, fh_len > 4 ? ifid->parent_generation : 0); } const struct export_operations isofs_export_ops = { .encode_fh = isofs_export_encode_fh, .fh_to_dentry = isofs_fh_to_dentry, .fh_to_parent = isofs_fh_to_parent, .get_parent = isofs_export_get_parent, };
2 2 2 1 1 2 1 1 1 1 1 2 2 2 2 2 3 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 // SPDX-License-Identifier: GPL-2.0-only /* * hid-ft260.c - FTDI FT260 USB HID to I2C host bridge * * Copyright (c) 2021, Michael Zaidman <michaelz@xsightlabs.com> * * Data Sheet: * https://www.ftdichip.com/Support/Documents/DataSheets/ICs/DS_FT260.pdf */ #include "hid-ids.h" #include <linux/hidraw.h> #include <linux/i2c.h> #include <linux/module.h> #include <linux/usb.h> #ifdef DEBUG static int ft260_debug = 1; #else static int ft260_debug; #endif module_param_named(debug, ft260_debug, int, 0600); MODULE_PARM_DESC(debug, "Toggle FT260 debugging messages"); #define ft260_dbg(format, arg...) \ do { \ if (ft260_debug) \ pr_info("%s: " format, __func__, ##arg); \ } while (0) #define FT260_REPORT_MAX_LENGTH (64) #define FT260_I2C_DATA_REPORT_ID(len) (FT260_I2C_REPORT_MIN + (len - 1) / 4) #define FT260_WAKEUP_NEEDED_AFTER_MS (4800) /* 5s minus 200ms margin */ /* * The ft260 input report format defines 62 bytes for the data payload, but * when requested 62 bytes, the controller returns 60 and 2 in separate input * reports. To achieve better performance with the multi-report read data * transfers, we set the maximum read payload length to a multiple of 60. * With a 100 kHz I2C clock, one 240 bytes read takes about 1/27 second, * which is excessive; On the other hand, some higher layer drivers like at24 * or optoe limit the i2c reads to 128 bytes. To not block other drivers out * of I2C for potentially troublesome amounts of time, we select the maximum * read payload length to be 180 bytes. 
*/ #define FT260_RD_DATA_MAX (180) #define FT260_WR_DATA_MAX (60) /* * Device interface configuration. * The FT260 has 2 interfaces that are controlled by DCNF0 and DCNF1 pins. * First implementes USB HID to I2C bridge function and * second - USB HID to UART bridge function. */ enum { FT260_MODE_ALL = 0x00, FT260_MODE_I2C = 0x01, FT260_MODE_UART = 0x02, FT260_MODE_BOTH = 0x03, }; /* Control pipe */ enum { FT260_GET_RQST_TYPE = 0xA1, FT260_GET_REPORT = 0x01, FT260_SET_RQST_TYPE = 0x21, FT260_SET_REPORT = 0x09, FT260_FEATURE = 0x03, }; /* Report IDs / Feature In */ enum { FT260_CHIP_VERSION = 0xA0, FT260_SYSTEM_SETTINGS = 0xA1, FT260_I2C_STATUS = 0xC0, FT260_I2C_READ_REQ = 0xC2, FT260_I2C_REPORT_MIN = 0xD0, FT260_I2C_REPORT_MAX = 0xDE, FT260_GPIO = 0xB0, FT260_UART_INTERRUPT_STATUS = 0xB1, FT260_UART_STATUS = 0xE0, FT260_UART_RI_DCD_STATUS = 0xE1, FT260_UART_REPORT = 0xF0, }; /* Feature Out */ enum { FT260_SET_CLOCK = 0x01, FT260_SET_I2C_MODE = 0x02, FT260_SET_UART_MODE = 0x03, FT260_ENABLE_INTERRUPT = 0x05, FT260_SELECT_GPIO2_FUNC = 0x06, FT260_ENABLE_UART_DCD_RI = 0x07, FT260_SELECT_GPIOA_FUNC = 0x08, FT260_SELECT_GPIOG_FUNC = 0x09, FT260_SET_INTERRUPT_TRIGGER = 0x0A, FT260_SET_SUSPEND_OUT_POLAR = 0x0B, FT260_ENABLE_UART_RI_WAKEUP = 0x0C, FT260_SET_UART_RI_WAKEUP_CFG = 0x0D, FT260_SET_I2C_RESET = 0x20, FT260_SET_I2C_CLOCK_SPEED = 0x22, FT260_SET_UART_RESET = 0x40, FT260_SET_UART_CONFIG = 0x41, FT260_SET_UART_BAUD_RATE = 0x42, FT260_SET_UART_DATA_BIT = 0x43, FT260_SET_UART_PARITY = 0x44, FT260_SET_UART_STOP_BIT = 0x45, FT260_SET_UART_BREAKING = 0x46, FT260_SET_UART_XON_XOFF = 0x49, }; /* Response codes in I2C status report */ enum { FT260_I2C_STATUS_SUCCESS = 0x00, FT260_I2C_STATUS_CTRL_BUSY = 0x01, FT260_I2C_STATUS_ERROR = 0x02, FT260_I2C_STATUS_ADDR_NO_ACK = 0x04, FT260_I2C_STATUS_DATA_NO_ACK = 0x08, FT260_I2C_STATUS_ARBITR_LOST = 0x10, FT260_I2C_STATUS_CTRL_IDLE = 0x20, FT260_I2C_STATUS_BUS_BUSY = 0x40, }; /* I2C Conditions flags */ enum { FT260_FLAG_NONE = 0x00, 
FT260_FLAG_START = 0x02, FT260_FLAG_START_REPEATED = 0x03, FT260_FLAG_STOP = 0x04, FT260_FLAG_START_STOP = 0x06, FT260_FLAG_START_STOP_REPEATED = 0x07, }; #define FT260_SET_REQUEST_VALUE(report_id) ((FT260_FEATURE << 8) | report_id) /* Feature In reports */ struct ft260_get_chip_version_report { u8 report; /* FT260_CHIP_VERSION */ u8 chip_code[4]; /* FTDI chip identification code */ u8 reserved[8]; } __packed; struct ft260_get_system_status_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 chip_mode; /* DCNF0 and DCNF1 status, bits 0-1 */ u8 clock_ctl; /* 0 - 12MHz, 1 - 24MHz, 2 - 48MHz */ u8 suspend_status; /* 0 - not suspended, 1 - suspended */ u8 pwren_status; /* 0 - FT260 is not ready, 1 - ready */ u8 i2c_enable; /* 0 - disabled, 1 - enabled */ u8 uart_mode; /* 0 - OFF; 1 - RTS_CTS, 2 - DTR_DSR, */ /* 3 - XON_XOFF, 4 - No flow control */ u8 hid_over_i2c_en; /* 0 - disabled, 1 - enabled */ u8 gpio2_function; /* 0 - GPIO, 1 - SUSPOUT, */ /* 2 - PWREN, 4 - TX_LED */ u8 gpioA_function; /* 0 - GPIO, 3 - TX_ACTIVE, 4 - TX_LED */ u8 gpioG_function; /* 0 - GPIO, 2 - PWREN, */ /* 5 - RX_LED, 6 - BCD_DET */ u8 suspend_out_pol; /* 0 - active-high, 1 - active-low */ u8 enable_wakeup_int; /* 0 - disabled, 1 - enabled */ u8 intr_cond; /* Interrupt trigger conditions */ u8 power_saving_en; /* 0 - disabled, 1 - enabled */ u8 reserved[10]; } __packed; struct ft260_get_i2c_status_report { u8 report; /* FT260_I2C_STATUS */ u8 bus_status; /* I2C bus status */ __le16 clock; /* I2C bus clock in range 60-3400 KHz */ u8 reserved; } __packed; /* Feature Out reports */ struct ft260_set_system_clock_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_CLOCK */ u8 clock_ctl; /* 0 - 12MHz, 1 - 24MHz, 2 - 48MHz */ } __packed; struct ft260_set_i2c_mode_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_I2C_MODE */ u8 i2c_enable; /* 0 - disabled, 1 - enabled */ } __packed; struct ft260_set_uart_mode_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 
request; /* FT260_SET_UART_MODE */ u8 uart_mode; /* 0 - OFF; 1 - RTS_CTS, 2 - DTR_DSR, */ /* 3 - XON_XOFF, 4 - No flow control */ } __packed; struct ft260_set_i2c_reset_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_I2C_RESET */ } __packed; struct ft260_set_i2c_speed_report { u8 report; /* FT260_SYSTEM_SETTINGS */ u8 request; /* FT260_SET_I2C_CLOCK_SPEED */ __le16 clock; /* I2C bus clock in range 60-3400 KHz */ } __packed; /* Data transfer reports */ struct ft260_i2c_write_request_report { u8 report; /* FT260_I2C_REPORT */ u8 address; /* 7-bit I2C address */ u8 flag; /* I2C transaction condition */ u8 length; /* data payload length */ u8 data[FT260_WR_DATA_MAX]; /* data payload */ } __packed; struct ft260_i2c_read_request_report { u8 report; /* FT260_I2C_READ_REQ */ u8 address; /* 7-bit I2C address */ u8 flag; /* I2C transaction condition */ __le16 length; /* data payload length */ } __packed; struct ft260_i2c_input_report { u8 report; /* FT260_I2C_REPORT */ u8 length; /* data payload length */ u8 data[2]; /* data payload */ } __packed; static const struct hid_device_id ft260_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_FUTURE_TECHNOLOGY, USB_DEVICE_ID_FT260) }, { /* END OF LIST */ } }; MODULE_DEVICE_TABLE(hid, ft260_devices); struct ft260_device { struct i2c_adapter adap; struct hid_device *hdev; struct completion wait; struct mutex lock; u8 write_buf[FT260_REPORT_MAX_LENGTH]; unsigned long need_wakeup_at; u8 *read_buf; u16 read_idx; u16 read_len; u16 clock; }; static int ft260_hid_feature_report_get(struct hid_device *hdev, unsigned char report_id, u8 *data, size_t len) { u8 *buf; int ret; buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; ret = hid_hw_raw_request(hdev, report_id, buf, len, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); if (likely(ret == len)) memcpy(data, buf, len); else if (ret >= 0) ret = -EIO; kfree(buf); return ret; } static int ft260_hid_feature_report_set(struct hid_device *hdev, u8 *data, size_t len) { u8 *buf; 
int ret; buf = kmemdup(data, len, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = FT260_SYSTEM_SETTINGS; ret = hid_hw_raw_request(hdev, buf[0], buf, len, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); kfree(buf); return ret; } static int ft260_i2c_reset(struct hid_device *hdev) { struct ft260_set_i2c_reset_report report; int ret; report.request = FT260_SET_I2C_RESET; ret = ft260_hid_feature_report_set(hdev, (u8 *)&report, sizeof(report)); if (ret < 0) { hid_err(hdev, "failed to reset I2C controller: %d\n", ret); return ret; } ft260_dbg("done\n"); return ret; } static int ft260_xfer_status(struct ft260_device *dev, u8 bus_busy) { struct hid_device *hdev = dev->hdev; struct ft260_get_i2c_status_report report; int ret; if (time_is_before_jiffies(dev->need_wakeup_at)) { ret = ft260_hid_feature_report_get(hdev, FT260_I2C_STATUS, (u8 *)&report, sizeof(report)); if (unlikely(ret < 0)) { hid_err(hdev, "failed to retrieve status: %d, no wakeup\n", ret); } else { dev->need_wakeup_at = jiffies + msecs_to_jiffies(FT260_WAKEUP_NEEDED_AFTER_MS); ft260_dbg("bus_status %#02x, wakeup\n", report.bus_status); } } ret = ft260_hid_feature_report_get(hdev, FT260_I2C_STATUS, (u8 *)&report, sizeof(report)); if (unlikely(ret < 0)) { hid_err(hdev, "failed to retrieve status: %d\n", ret); return ret; } dev->clock = le16_to_cpu(report.clock); ft260_dbg("bus_status %#02x, clock %u\n", report.bus_status, dev->clock); if (report.bus_status & (FT260_I2C_STATUS_CTRL_BUSY | bus_busy)) return -EAGAIN; /* * The error condition (bit 1) is a status bit reflecting any * error conditions. When any of the bits 2, 3, or 4 are raised * to 1, bit 1 is also set to 1. 
/*
 * Send one output report and poll the controller until the transfer ends.
 *
 * @dev:  driver state; dev->clock (I2C clock in KHz) is used to estimate
 *        how long the transfer takes on the wire.
 * @data: report to send; interpreted as a write request report so that
 *        its flag field can be inspected.
 * @len:  number of bytes in @data.
 *
 * Returns 0 when the controller reports the transfer as finished, the
 * negative error from ft260_hid_output_report() if the report could not be
 * sent, or -EIO if the status never became clean within 100 polls.  The
 * I2C controller is reset on every failure path.
 */
static int ft260_hid_output_report_check_status(struct ft260_device *dev,
						u8 *data, int len)
{
	u8 bus_busy;
	int ret, usec, try = 100;
	struct hid_device *hdev = dev->hdev;
	struct ft260_i2c_write_request_report *rep =
		(struct ft260_i2c_write_request_report *)data;

	ret = ft260_hid_output_report(hdev, data, len);
	if (ret < 0) {
		hid_err(hdev, "%s: failed to start transfer, ret %d\n",
			__func__, ret);
		ft260_i2c_reset(hdev);
		return ret;
	}

	/*
	 * transfer time = 1 / clock(KHz) * 9 bits * bytes
	 *
	 * dev->clock starts out as 0 (the device struct is zero-allocated)
	 * and is only filled in by a successful status read, so it may still
	 * be 0 here, or the device may report 0.  Skip the estimate in that
	 * case instead of dividing by zero; the polling loop below still
	 * waits for completion.
	 */
	if (dev->clock) {
		usec = len * 9000 / dev->clock;
		if (usec > 2000) {
			usec -= 1500;
			usleep_range(usec, usec + 100);
			ft260_dbg("wait %d usec, len %d\n", usec, len);
		}
	}

	/*
	 * Do not check the busy bit for combined transactions
	 * since the controller keeps the bus busy between writing
	 * and reading IOs to ensure an atomic operation.
	 */
	if (rep->flag == FT260_FLAG_START)
		bus_busy = 0;
	else
		bus_busy = FT260_I2C_STATUS_BUS_BUSY;

	/* Poll while the controller still reports itself (or the bus) busy. */
	do {
		ret = ft260_xfer_status(dev, bus_busy);
		if (ret != -EAGAIN)
			break;
	} while (--try);

	if (ret == 0)
		return 0;

	ft260_i2c_reset(hdev);
	return -EIO;
}
u16 rd_data_max = 60; int timeout, ret = 0; struct ft260_i2c_read_request_report rep; struct hid_device *hdev = dev->hdev; u8 bus_busy = 0; if ((flag & FT260_FLAG_START_REPEATED) == FT260_FLAG_START_REPEATED) flag = FT260_FLAG_START_REPEATED; else flag = FT260_FLAG_START; do { if (len <= rd_data_max) { rd_len = len; flag |= FT260_FLAG_STOP; } else { rd_len = rd_data_max; } rd_data_max = FT260_RD_DATA_MAX; rep.report = FT260_I2C_READ_REQ; rep.length = cpu_to_le16(rd_len); rep.address = addr; rep.flag = flag; ft260_dbg("rep %#02x addr %#02x len %d rlen %d flag %#x\n", rep.report, rep.address, len, rd_len, flag); reinit_completion(&dev->wait); dev->read_idx = 0; dev->read_buf = data; dev->read_len = rd_len; ret = ft260_hid_output_report(hdev, (u8 *)&rep, sizeof(rep)); if (ret < 0) { hid_err(hdev, "%s: failed with %d\n", __func__, ret); goto ft260_i2c_read_exit; } timeout = msecs_to_jiffies(5000); if (!wait_for_completion_timeout(&dev->wait, timeout)) { ret = -ETIMEDOUT; ft260_i2c_reset(hdev); goto ft260_i2c_read_exit; } dev->read_buf = NULL; if (flag & FT260_FLAG_STOP) bus_busy = FT260_I2C_STATUS_BUS_BUSY; ret = ft260_xfer_status(dev, bus_busy); if (ret < 0) { ret = -EIO; ft260_i2c_reset(hdev); goto ft260_i2c_read_exit; } len -= rd_len; data += rd_len; flag = 0; } while (len > 0); ft260_i2c_read_exit: dev->read_buf = NULL; return ret; } /* * A random read operation is implemented as a dummy write operation, followed * by a current address read operation. The dummy write operation is used to * load the target byte address into the current byte address counter, from * which the subsequent current address read operation then reads. 
*/ static int ft260_i2c_write_read(struct ft260_device *dev, struct i2c_msg *msgs) { int ret; int wr_len = msgs[0].len; int rd_len = msgs[1].len; struct hid_device *hdev = dev->hdev; u8 addr = msgs[0].addr; u16 read_off = 0; if (wr_len > 2) { hid_err(hdev, "%s: invalid wr_len: %d\n", __func__, wr_len); return -EOPNOTSUPP; } if (ft260_debug) { if (wr_len == 2) read_off = be16_to_cpu(*(__be16 *)msgs[0].buf); else read_off = *msgs[0].buf; pr_info("%s: off %#x rlen %d wlen %d\n", __func__, read_off, rd_len, wr_len); } ret = ft260_i2c_write(dev, addr, msgs[0].buf, wr_len, FT260_FLAG_START); if (ret < 0) return ret; ret = ft260_i2c_read(dev, addr, msgs[1].buf, rd_len, FT260_FLAG_START_STOP_REPEATED); if (ret < 0) return ret; return 0; } static int ft260_i2c_xfer(struct i2c_adapter *adapter, struct i2c_msg *msgs, int num) { int ret; struct ft260_device *dev = i2c_get_adapdata(adapter); struct hid_device *hdev = dev->hdev; mutex_lock(&dev->lock); ret = hid_hw_power(hdev, PM_HINT_FULLON); if (ret < 0) { hid_err(hdev, "failed to enter FULLON power mode: %d\n", ret); mutex_unlock(&dev->lock); return ret; } if (num == 1) { if (msgs->flags & I2C_M_RD) ret = ft260_i2c_read(dev, msgs->addr, msgs->buf, msgs->len, FT260_FLAG_START_STOP); else ret = ft260_i2c_write(dev, msgs->addr, msgs->buf, msgs->len, FT260_FLAG_START_STOP); if (ret < 0) goto i2c_exit; } else { /* Combined write then read message */ ret = ft260_i2c_write_read(dev, msgs); if (ret < 0) goto i2c_exit; } ret = num; i2c_exit: hid_hw_power(hdev, PM_HINT_NORMAL); mutex_unlock(&dev->lock); return ret; } static int ft260_smbus_xfer(struct i2c_adapter *adapter, u16 addr, u16 flags, char read_write, u8 cmd, int size, union i2c_smbus_data *data) { int ret; struct ft260_device *dev = i2c_get_adapdata(adapter); struct hid_device *hdev = dev->hdev; ft260_dbg("smbus size %d\n", size); mutex_lock(&dev->lock); ret = hid_hw_power(hdev, PM_HINT_FULLON); if (ret < 0) { hid_err(hdev, "power management error: %d\n", ret); 
mutex_unlock(&dev->lock); return ret; } switch (size) { case I2C_SMBUS_BYTE: if (read_write == I2C_SMBUS_READ) ret = ft260_i2c_read(dev, addr, &data->byte, 1, FT260_FLAG_START_STOP); else ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START_STOP); break; case I2C_SMBUS_BYTE_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, &data->byte, 1, FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, &data->byte, 1, FT260_FLAG_START_STOP); } break; case I2C_SMBUS_WORD_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, (u8 *)&data->word, 2, FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, (u8 *)&data->word, 2, FT260_FLAG_START_STOP); } break; case I2C_SMBUS_BLOCK_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, data->block, data->block[0] + 1, FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, data->block, data->block[0] + 1, FT260_FLAG_START_STOP); } break; case I2C_SMBUS_I2C_BLOCK_DATA: if (read_write == I2C_SMBUS_READ) { ret = ft260_smbus_write(dev, addr, cmd, NULL, 0, FT260_FLAG_START); if (ret) goto smbus_exit; ret = ft260_i2c_read(dev, addr, data->block + 1, data->block[0], FT260_FLAG_START_STOP_REPEATED); } else { ret = ft260_smbus_write(dev, addr, cmd, data->block + 1, data->block[0], FT260_FLAG_START_STOP); } break; default: hid_err(hdev, "unsupported smbus transaction size %d\n", size); ret = -EOPNOTSUPP; } smbus_exit: hid_hw_power(hdev, PM_HINT_NORMAL); mutex_unlock(&dev->lock); return ret; } static u32 ft260_functionality(struct i2c_adapter *adap) { return I2C_FUNC_I2C | I2C_FUNC_SMBUS_BYTE | 
I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | I2C_FUNC_SMBUS_BLOCK_DATA | I2C_FUNC_SMBUS_I2C_BLOCK; } static const struct i2c_adapter_quirks ft260_i2c_quirks = { .flags = I2C_AQ_COMB_WRITE_THEN_READ, .max_comb_1st_msg_len = 2, }; static const struct i2c_algorithm ft260_i2c_algo = { .master_xfer = ft260_i2c_xfer, .smbus_xfer = ft260_smbus_xfer, .functionality = ft260_functionality, }; static int ft260_get_system_config(struct hid_device *hdev, struct ft260_get_system_status_report *cfg) { int ret; int len = sizeof(struct ft260_get_system_status_report); ret = ft260_hid_feature_report_get(hdev, FT260_SYSTEM_SETTINGS, (u8 *)cfg, len); if (ret < 0) { hid_err(hdev, "failed to retrieve system status\n"); return ret; } return 0; } static int ft260_is_interface_enabled(struct hid_device *hdev) { struct ft260_get_system_status_report cfg; struct usb_interface *usbif = to_usb_interface(hdev->dev.parent); int interface = usbif->cur_altsetting->desc.bInterfaceNumber; int ret; ret = ft260_get_system_config(hdev, &cfg); if (ret < 0) return ret; ft260_dbg("interface: 0x%02x\n", interface); ft260_dbg("chip mode: 0x%02x\n", cfg.chip_mode); ft260_dbg("clock_ctl: 0x%02x\n", cfg.clock_ctl); ft260_dbg("i2c_enable: 0x%02x\n", cfg.i2c_enable); ft260_dbg("uart_mode: 0x%02x\n", cfg.uart_mode); switch (cfg.chip_mode) { case FT260_MODE_ALL: case FT260_MODE_BOTH: if (interface == 1) hid_info(hdev, "uart interface is not supported\n"); else ret = 1; break; case FT260_MODE_UART: hid_info(hdev, "uart interface is not supported\n"); break; case FT260_MODE_I2C: ret = 1; break; } return ret; } static int ft260_byte_show(struct hid_device *hdev, int id, u8 *cfg, int len, u8 *field, u8 *buf) { int ret; ret = ft260_hid_feature_report_get(hdev, id, cfg, len); if (ret < 0) return ret; return scnprintf(buf, PAGE_SIZE, "%d\n", *field); } static int ft260_word_show(struct hid_device *hdev, int id, u8 *cfg, int len, __le16 *field, u8 *buf) { int ret; ret = ft260_hid_feature_report_get(hdev, id, cfg, 
/*
 * FT260_ATTR_STORE - generate the sysfs store handler name##_store().
 *
 * @name:    attribute name; also the name of the value field in @reptype
 * @reptype: struct tag of the "set" feature report to send
 * @id:      report ID placed in the report field
 * @req:     request code placed in the request field
 * @type:    type of the local variable holding the parsed value
 * @ctype:   type expected by @func for its result pointer
 * @func:    kstrto*() style parser, called with base 10
 *
 * The generated handler parses the user's string, fills in the report and
 * sends it with ft260_hid_feature_report_set().  That helper passes through
 * the result of hid_hw_raw_request(), which on success is the number of
 * bytes transferred rather than 0, so every non-negative result is treated
 * as success and reported to sysfs as @count (the whole write consumed).
 * Returns -EINVAL if the string cannot be parsed, or the negative error
 * from the feature report request.
 */
#define FT260_ATTR_STORE(name, reptype, id, req, type, ctype, func)	       \
	static ssize_t name##_store(struct device *kdev,		       \
				    struct device_attribute *attr,	       \
				    const char *buf, size_t count)	       \
	{								       \
		struct reptype rep;					       \
		struct hid_device *hdev = to_hid_device(kdev);		       \
		type name;						       \
		int ret;						       \
									       \
		if (!func(buf, 10, (ctype *)&name)) {			       \
			rep.name = name;				       \
			rep.report = id;				       \
			rep.request = req;				       \
			ret = ft260_hid_feature_report_set(hdev, (u8 *)&rep,   \
							   sizeof(rep));       \
			if (ret >= 0)					       \
				ret = count;				       \
		} else {						       \
			ret = -EINVAL;					       \
		}							       \
		return ret;						       \
	}
FT260_SET_I2C_MODE); static DEVICE_ATTR_RW(i2c_enable); FT260_SSTAT_ATTR_SHOW(uart_mode); FT260_BYTE_ATTR_STORE(uart_mode, ft260_set_uart_mode_report, FT260_SET_UART_MODE); static DEVICE_ATTR_RW(uart_mode); FT260_SSTAT_ATTR_SHOW(clock_ctl); FT260_BYTE_ATTR_STORE(clock_ctl, ft260_set_system_clock_report, FT260_SET_CLOCK); static DEVICE_ATTR_RW(clock_ctl); FT260_I2CST_ATTR_SHOW(clock); FT260_WORD_ATTR_STORE(clock, ft260_set_i2c_speed_report, FT260_SET_I2C_CLOCK_SPEED); static DEVICE_ATTR_RW(clock); static ssize_t i2c_reset_store(struct device *kdev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(kdev); int ret = ft260_i2c_reset(hdev); if (ret) return ret; return count; } static DEVICE_ATTR_WO(i2c_reset); static const struct attribute_group ft260_attr_group = { .attrs = (struct attribute *[]) { &dev_attr_chip_mode.attr, &dev_attr_pwren_status.attr, &dev_attr_suspend_status.attr, &dev_attr_hid_over_i2c_en.attr, &dev_attr_power_saving_en.attr, &dev_attr_i2c_enable.attr, &dev_attr_uart_mode.attr, &dev_attr_clock_ctl.attr, &dev_attr_i2c_reset.attr, &dev_attr_clock.attr, NULL } }; static int ft260_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct ft260_device *dev; struct ft260_get_chip_version_report version; int ret; if (!hid_is_usb(hdev)) return -EINVAL; dev = devm_kzalloc(&hdev->dev, sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "failed to parse HID\n"); return ret; } ret = hid_hw_start(hdev, 0); if (ret) { hid_err(hdev, "failed to start HID HW\n"); return ret; } ret = hid_hw_open(hdev); if (ret) { hid_err(hdev, "failed to open HID HW\n"); goto err_hid_stop; } ret = ft260_hid_feature_report_get(hdev, FT260_CHIP_VERSION, (u8 *)&version, sizeof(version)); if (ret < 0) { hid_err(hdev, "failed to retrieve chip version\n"); goto err_hid_close; } hid_info(hdev, "chip code: %02x%02x %02x%02x\n", version.chip_code[0], version.chip_code[1], 
version.chip_code[2], version.chip_code[3]); ret = ft260_is_interface_enabled(hdev); if (ret <= 0) goto err_hid_close; hid_info(hdev, "USB HID v%x.%02x Device [%s] on %s\n", hdev->version >> 8, hdev->version & 0xff, hdev->name, hdev->phys); hid_set_drvdata(hdev, dev); dev->hdev = hdev; dev->adap.owner = THIS_MODULE; dev->adap.class = I2C_CLASS_HWMON; dev->adap.algo = &ft260_i2c_algo; dev->adap.quirks = &ft260_i2c_quirks; dev->adap.dev.parent = &hdev->dev; snprintf(dev->adap.name, sizeof(dev->adap.name), "FT260 usb-i2c bridge"); mutex_init(&dev->lock); init_completion(&dev->wait); ret = ft260_xfer_status(dev, FT260_I2C_STATUS_BUS_BUSY); if (ret) ft260_i2c_reset(hdev); i2c_set_adapdata(&dev->adap, dev); ret = i2c_add_adapter(&dev->adap); if (ret) { hid_err(hdev, "failed to add i2c adapter\n"); goto err_hid_close; } ret = sysfs_create_group(&hdev->dev.kobj, &ft260_attr_group); if (ret < 0) { hid_err(hdev, "failed to create sysfs attrs\n"); goto err_i2c_free; } return 0; err_i2c_free: i2c_del_adapter(&dev->adap); err_hid_close: hid_hw_close(hdev); err_hid_stop: hid_hw_stop(hdev); return ret; } static void ft260_remove(struct hid_device *hdev) { struct ft260_device *dev = hid_get_drvdata(hdev); if (!dev) return; sysfs_remove_group(&hdev->dev.kobj, &ft260_attr_group); i2c_del_adapter(&dev->adap); hid_hw_close(hdev); hid_hw_stop(hdev); } static int ft260_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct ft260_device *dev = hid_get_drvdata(hdev); struct ft260_i2c_input_report *xfer = (void *)data; if (xfer->report >= FT260_I2C_REPORT_MIN && xfer->report <= FT260_I2C_REPORT_MAX) { ft260_dbg("i2c resp: rep %#02x len %d\n", xfer->report, xfer->length); if ((dev->read_buf == NULL) || (xfer->length > dev->read_len - dev->read_idx)) { hid_err(hdev, "unexpected report %#02x, length %d\n", xfer->report, xfer->length); return -1; } memcpy(&dev->read_buf[dev->read_idx], &xfer->data, xfer->length); dev->read_idx += xfer->length; if 
(dev->read_idx == dev->read_len) complete(&dev->wait); } else { hid_err(hdev, "unhandled report %#02x\n", xfer->report); } return 0; } static struct hid_driver ft260_driver = { .name = "ft260", .id_table = ft260_devices, .probe = ft260_probe, .remove = ft260_remove, .raw_event = ft260_raw_event, }; module_hid_driver(ft260_driver); MODULE_DESCRIPTION("FTDI FT260 USB HID to I2C host bridge"); MODULE_AUTHOR("Michael Zaidman <michael.zaidman@gmail.com>"); MODULE_LICENSE("GPL v2");
8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 /* * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/rbtree.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <rdma/iw_cm.h> #include <rdma/ib_addr.h> #include <rdma/iw_portmap.h> #include <rdma/rdma_netlink.h> #include "iwcm.h" MODULE_AUTHOR("Tom Tucker"); MODULE_DESCRIPTION("iWARP CM"); MODULE_LICENSE("Dual BSD/GPL"); static const char * const iwcm_rej_reason_strs[] = { [ECONNRESET] = "reset by remote host", [ECONNREFUSED] = "refused by remote application", [ETIMEDOUT] = "setup timeout", }; const char *__attribute_const__ iwcm_reject_msg(int reason) { size_t index; /* iWARP uses negative errnos */ index = -reason; if (index < ARRAY_SIZE(iwcm_rej_reason_strs) && iwcm_rej_reason_strs[index]) return iwcm_rej_reason_strs[index]; else return "unrecognized reason"; } EXPORT_SYMBOL(iwcm_reject_msg); static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; struct iwcm_work { struct work_struct work; struct iwcm_id_private *cm_id; struct list_head list; struct iw_cm_event event; struct list_head free_list; }; static unsigned int default_backlog = 256; static struct ctl_table_header *iwcm_ctl_table_hdr; static struct ctl_table iwcm_ctl_table[] = { { .procname = "default_backlog", .data = &default_backlog, .maxlen = 
sizeof(default_backlog), .mode = 0644, .proc_handler = proc_dointvec, }, }; /* * The following services provide a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: * LISTENING IDS: Get enough elements preallocated to handle the * listen backlog. * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE * * Allocating them in connect and listen avoids having to deal * with allocation failures on the event upcall from the provider (which * is called in the interrupt context). * * One exception is when creating the cm_id for incoming connection requests. * There are two cases: * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If * the backlog is exceeded, then no more connection request events will * be processed. cm_event_handler() returns -ENOMEM in this case. Its up * to the provider to reject the connection request. * 2) in the connection request workqueue handler, cm_conn_req_handler(). * If work elements cannot be allocated for the new connect request cm_id, * then IWCM will call the provider reject method. This is ok since * cm_conn_req_handler() runs in the workqueue thread context. 
*/ static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv) { struct iwcm_work *work; if (list_empty(&cm_id_priv->work_free_list)) return NULL; work = list_first_entry(&cm_id_priv->work_free_list, struct iwcm_work, free_list); list_del_init(&work->free_list); return work; } static void put_work(struct iwcm_work *work) { list_add(&work->free_list, &work->cm_id->work_free_list); } static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv) { struct list_head *e, *tmp; list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) { list_del(e); kfree(list_entry(e, struct iwcm_work, free_list)); } } static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count) { struct iwcm_work *work; BUG_ON(!list_empty(&cm_id_priv->work_free_list)); while (count--) { work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL); if (!work) { dealloc_work_entries(cm_id_priv); return -ENOMEM; } work->cm_id = cm_id_priv; INIT_LIST_HEAD(&work->list); put_work(work); } return 0; } /* * Save private data from incoming connection requests to * iw_cm_event, so the low level driver doesn't have to. Adjust * the event ptr to point to the local copy. */ static int copy_private_data(struct iw_cm_event *event) { void *p; p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC); if (!p) return -ENOMEM; event->private_data = p; return 0; } static void free_cm_id(struct iwcm_id_private *cm_id_priv) { dealloc_work_entries(cm_id_priv); kfree(cm_id_priv); } /* * Release a reference on cm_id. If the last reference is being * released, free the cm_id and return 'true'. 
*/ static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) { if (refcount_dec_and_test(&cm_id_priv->refcount)) { BUG_ON(!list_empty(&cm_id_priv->work_list)); free_cm_id(cm_id_priv); return true; } return false; } static void add_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); refcount_inc(&cm_id_priv->refcount); } static void rem_ref(struct iw_cm_id *cm_id) { struct iwcm_id_private *cm_id_priv; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); (void)iwcm_deref_id(cm_id_priv); } static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event); struct iw_cm_id *iw_create_cm_id(struct ib_device *device, iw_cm_handler cm_handler, void *context) { struct iwcm_id_private *cm_id_priv; cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL); if (!cm_id_priv) return ERR_PTR(-ENOMEM); cm_id_priv->state = IW_CM_STATE_IDLE; cm_id_priv->id.device = device; cm_id_priv->id.cm_handler = cm_handler; cm_id_priv->id.context = context; cm_id_priv->id.event_handler = cm_event_handler; cm_id_priv->id.add_ref = add_ref; cm_id_priv->id.rem_ref = rem_ref; spin_lock_init(&cm_id_priv->lock); refcount_set(&cm_id_priv->refcount, 1); init_waitqueue_head(&cm_id_priv->connect_wait); init_completion(&cm_id_priv->destroy_comp); INIT_LIST_HEAD(&cm_id_priv->work_list); INIT_LIST_HEAD(&cm_id_priv->work_free_list); return &cm_id_priv->id; } EXPORT_SYMBOL(iw_create_cm_id); static int iwcm_modify_qp_err(struct ib_qp *qp) { struct ib_qp_attr qp_attr; if (!qp) return -EINVAL; qp_attr.qp_state = IB_QPS_ERR; return ib_modify_qp(qp, &qp_attr, IB_QP_STATE); } /* * This is really the RDMAC CLOSING state. It is most similar to the * IB SQD QP state. 
*/
/*
 * Ask the provider to move @qp to the SQD state. @qp must not be NULL
 * (BUG otherwise). Returns the result of ib_modify_qp().
 */
static int iwcm_modify_qp_sqd(struct ib_qp *qp)
{
	struct ib_qp_attr qp_attr;

	BUG_ON(qp == NULL);
	/* Only the state field is set; the mask selects just that field. */
	qp_attr.qp_state = IB_QPS_SQD;
	return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
}

/*
 * CM_ID <-- CLOSING
 *
 * Block if a passive or active connection is currently being processed. Then
 * process the event as follows:
 * - If we are ESTABLISHED, move to CLOSING and modify the QP state
 *   based on the abrupt flag
 * - If the connection is already in the CLOSING or IDLE state, the peer is
 *   disconnecting concurrently with us and we've already seen the
 *   DISCONNECT event -- ignore the request and return 0
 * - Disconnect on a listening endpoint returns -EINVAL
 *
 * Returns 0, or -EINVAL for a listening endpoint or for an ESTABLISHED
 * cm_id that has no QP attached.
 */
int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
{
	struct iwcm_id_private *cm_id_priv;
	unsigned long flags;
	int ret = 0;
	struct ib_qp *qp = NULL;

	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
	/* Wait if we're currently in a connect or accept downcall */
	wait_event(cm_id_priv->connect_wait,
		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	switch (cm_id_priv->state) {
	case IW_CM_STATE_ESTABLISHED:
		cm_id_priv->state = IW_CM_STATE_CLOSING;

		/* QP could be NULL for a user-mode client */
		if (cm_id_priv->qp)
			qp = cm_id_priv->qp;
		else
			ret = -EINVAL;
		break;
	case IW_CM_STATE_LISTEN:
		ret = -EINVAL;
		break;
	case IW_CM_STATE_CLOSING:
		/* remote peer closed first */
	case IW_CM_STATE_IDLE:
		/* accept or connect returned !0 */
		break;
	case IW_CM_STATE_CONN_RECV:
		/*
		 * App called disconnect before/without calling accept after
		 * connect_request event delivered.
		 */
		break;
	case IW_CM_STATE_CONN_SENT:
		/* Can only get here if wait above fails */
	default:
		BUG();
	}
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);

	/* The QP is modified only after the lock has been dropped. */
	if (qp) {
		if (abrupt)
			ret = iwcm_modify_qp_err(qp);
		else
			ret = iwcm_modify_qp_sqd(qp);

		/*
		 * If both sides are disconnecting the QP could
		 * already be in ERR or SQD states, so the result of the
		 * modify call above is deliberately discarded.
		 */
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL(iw_cm_disconnect);

/*
 * CM_ID <-- DESTROYING
 *
 * Clean up all resources associated with the connection and release
 * the initial reference taken by iw_create_cm_id.
 *
 * Returns true if and only if the last cm_id_priv reference has been dropped.
 */
static bool destroy_cm_id(struct iw_cm_id *cm_id)
{
	struct iwcm_id_private *cm_id_priv;
	struct ib_qp *qp;
	unsigned long flags;

	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
	/*
	 * Wait if we're currently in a connect or accept downcall. A
	 * listening endpoint should never block here.
	 */
	wait_event(cm_id_priv->connect_wait,
		   !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));

	/*
	 * Since we're deleting the cm_id, drop any events that
	 * might arrive before the last dereference.
	 */
	set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	/* Detach the QP under the lock; its reference is dropped below. */
	qp = cm_id_priv->qp;
	cm_id_priv->qp = NULL;

	/*
	 * Every provider downcall in this switch is made with the lock
	 * released; the lock is re-taken afterwards so that the single
	 * unlock after the switch is balanced on all paths.
	 */
	switch (cm_id_priv->state) {
	case IW_CM_STATE_LISTEN:
		cm_id_priv->state = IW_CM_STATE_DESTROYING;
		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
		/* destroy the listening endpoint */
		cm_id->device->ops.iw_destroy_listen(cm_id);
		spin_lock_irqsave(&cm_id_priv->lock, flags);
		break;
	case IW_CM_STATE_ESTABLISHED:
		cm_id_priv->state = IW_CM_STATE_DESTROYING;
		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
		/* Abrupt close of the connection */
		(void)iwcm_modify_qp_err(qp);
		spin_lock_irqsave(&cm_id_priv->lock, flags);
		break;
	case IW_CM_STATE_IDLE:
	case IW_CM_STATE_CLOSING:
		cm_id_priv->state = IW_CM_STATE_DESTROYING;
		break;
	case IW_CM_STATE_CONN_RECV:
		/*
		 * App called destroy before/without calling accept after
		 * receiving connection request event notification or
		 * returned non zero from the event callback function.
		 * In either case, must tell the provider to reject.
		 */
		cm_id_priv->state = IW_CM_STATE_DESTROYING;
		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
		cm_id->device->ops.iw_reject(cm_id, NULL, 0);
		spin_lock_irqsave(&cm_id_priv->lock, flags);
		break;
	case IW_CM_STATE_CONN_SENT:
	case IW_CM_STATE_DESTROYING:
	default:
		BUG();
		break;
	}
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);

	/* Drop the reference on the QP that was detached above, if any. */
	if (qp)
		cm_id_priv->id.device->ops.iw_rem_ref(qp);

	/* Remove the port mapper state if a mapping was set up for this id. */
	if (cm_id->mapped) {
		iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
		iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
	}

	return iwcm_deref_id(cm_id_priv);
}

/*
 * This function is only called by the application thread and cannot
 * be called by the event thread. The function will wait for all
 * references to be released on the cm_id and then kfree the cm_id
 * object.
*/ void iw_destroy_cm_id(struct iw_cm_id *cm_id) { if (!destroy_cm_id(cm_id)) flush_workqueue(iwcm_wq); } EXPORT_SYMBOL(iw_destroy_cm_id); /** * iw_cm_check_wildcard - If IP address is 0 then use original * @pm_addr: sockaddr containing the ip to check for wildcard * @cm_addr: sockaddr containing the actual IP address * @cm_outaddr: sockaddr to set IP addr which leaving port * * Checks the pm_addr for wildcard and then sets cm_outaddr's * IP to the actual (cm_addr). */ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, struct sockaddr_storage *cm_addr, struct sockaddr_storage *cm_outaddr) { if (pm_addr->ss_family == AF_INET) { struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { struct sockaddr_in *cm4_addr = (struct sockaddr_in *)cm_addr; struct sockaddr_in *cm4_outaddr = (struct sockaddr_in *)cm_outaddr; cm4_outaddr->sin_addr = cm4_addr->sin_addr; } } else { struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr; if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) { struct sockaddr_in6 *cm6_addr = (struct sockaddr_in6 *)cm_addr; struct sockaddr_in6 *cm6_outaddr = (struct sockaddr_in6 *)cm_outaddr; cm6_outaddr->sin6_addr = cm6_addr->sin6_addr; } } } /** * iw_cm_map - Use portmapper to map the ports * @cm_id: connection manager pointer * @active: Indicates the active side when true * returns nonzero for error only if iwpm_create_mapinfo() fails * * Tries to add a mapping for a port using the Portmapper. If * successful in mapping the IP/Port it will check the remote * mapped IP address for a wildcard IP address and replace the * zero IP address with the remote_addr. 
*/
static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
{
	const char *devname = dev_name(&cm_id->device->dev);
	const char *ifname = cm_id->device->iw_ifname;
	struct iwpm_dev_data pm_reg_msg = {};
	struct iwpm_sa_data pm_msg;
	int err;

	/* Both names must fit, including the terminator, before copying. */
	if (strlen(devname) >= sizeof(pm_reg_msg.dev_name))
		return -EINVAL;
	if (strlen(ifname) >= sizeof(pm_reg_msg.if_name))
		return -EINVAL;

	/* Until a mapping is obtained the mapped addresses equal the real ones. */
	cm_id->m_local_addr = cm_id->local_addr;
	cm_id->m_remote_addr = cm_id->remote_addr;

	strcpy(pm_reg_msg.dev_name, devname);
	strcpy(pm_reg_msg.if_name, ifname);

	/* Without a registered, valid port mapper there is nothing to map. */
	if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM))
		return 0;
	if (!iwpm_valid_pid())
		return 0;

	cm_id->mapped = true;
	pm_msg.loc_addr = cm_id->local_addr;
	pm_msg.rem_addr = cm_id->remote_addr;
	if (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP)
		pm_msg.flags = IWPM_FLAGS_NO_PORT_MAP;
	else
		pm_msg.flags = 0;

	/* The active side also queries the mapping of the remote address. */
	err = active ? iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM)
		     : iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM);

	if (!err) {
		cm_id->m_local_addr = pm_msg.mapped_loc_addr;
		if (active) {
			cm_id->m_remote_addr = pm_msg.mapped_rem_addr;
			iw_cm_check_wildcard(&pm_msg.mapped_rem_addr,
					     &cm_id->remote_addr,
					     &cm_id->m_remote_addr);
		}
	}

	/* The mapinfo entry is created whether or not the mapping succeeded. */
	return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr,
				   RDMA_NL_IWCM, pm_msg.flags);
}

/*
 * CM_ID <-- LISTEN
 *
 * Start listening for connect requests. Generates one CONNECT_REQUEST
 * event for each inbound connect request.
*/ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); if (!backlog) backlog = default_backlog; ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: cm_id_priv->state = IW_CM_STATE_LISTEN; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, false); if (!ret) ret = cm_id->device->ops.iw_create_listen(cm_id, backlog); if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); break; default: ret = -EINVAL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(iw_cm_listen); /* * CM_ID <-- IDLE * * Rejects an inbound connection request. No events are generated. */ int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data, u8 private_data_len) { struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id->device->ops.iw_reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return ret; } EXPORT_SYMBOL(iw_cm_reject); /* * CM_ID <-- ESTABLISHED * * Accepts an inbound connection request and generates an ESTABLISHED * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block * until the ESTABLISHED event is received from the provider. 
*/
/*
 * @cm_id:    cm_id in the CONN_RECV state
 * @iw_param: connection parameters; iw_param->qpn selects the QP to bind
 *
 * On success IWCM_F_CONNECT_WAIT is left set; on every failure path it
 * is cleared and waiters on connect_wait are woken.
 *
 * Returns 0 on success, -EINVAL if the cm_id is not in CONN_RECV or the
 * QP cannot be found, or the error returned by the provider's
 * iw_accept(). On a provider error the QP reference is dropped and the
 * cm_id returns to IDLE.
 */
int iw_cm_accept(struct iw_cm_id *cm_id,
		 struct iw_cm_conn_param *iw_param)
{
	struct iwcm_id_private *cm_id_priv;
	struct ib_qp *qp;
	unsigned long flags;
	int ret;

	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
		ret = -EINVAL;
		goto err_unlock;
	}
	/* Get the ib_qp given the QPN */
	qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
	if (!qp) {
		ret = -EINVAL;
		goto err_unlock;
	}
	cm_id->device->ops.iw_add_ref(qp);
	cm_id_priv->qp = qp;
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);

	ret = cm_id->device->ops.iw_accept(cm_id, iw_param);
	if (!ret)
		return 0;

	/*
	 * An error on accept precludes provider events. The state check
	 * and the rollback to IDLE are done with the lock held, like every
	 * other state transition of a published cm_id.
	 */
	spin_lock_irqsave(&cm_id_priv->lock, flags);
	BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
	cm_id_priv->state = IW_CM_STATE_IDLE;
	qp = cm_id_priv->qp;
	cm_id_priv->qp = NULL;
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
	if (qp)
		cm_id->device->ops.iw_rem_ref(qp);
	goto err_wake;

err_unlock:
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
err_wake:
	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
	wake_up_all(&cm_id_priv->connect_wait);
	return ret;
}
EXPORT_SYMBOL(iw_cm_accept);

/*
 * Active Side: CM_ID <-- CONN_SENT
 *
 * If successful, results in the generation of a CONNECT_REPLY
 * event. iw_cm_disconnect and iw_cm_destroy will block until the
 * CONNECT_REPLY event is received from the provider.
*/
/*
 * @cm_id:    cm_id in the IDLE state
 * @iw_param: connection parameters; iw_param->qpn selects the QP to bind
 *
 * On success IWCM_F_CONNECT_WAIT is left set. On failure after the work
 * entries were allocated it is cleared and waiters on connect_wait are
 * woken.
 *
 * Returns 0 on success, the error from alloc_work_entries(), -EINVAL if
 * the cm_id is not IDLE or the QP cannot be found, or the error from
 * iw_cm_map() or the provider's iw_connect().
 */
int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
{
	struct iwcm_id_private *cm_id_priv;
	int ret;
	unsigned long flags;
	struct ib_qp *qp = NULL;

	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);

	/* Four work entries are reserved for events on an active-side cm_id. */
	ret = alloc_work_entries(cm_id_priv, 4);
	if (ret)
		return ret;

	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
	spin_lock_irqsave(&cm_id_priv->lock, flags);

	/* qp is still NULL here, so the err path drops no QP reference. */
	if (cm_id_priv->state != IW_CM_STATE_IDLE) {
		ret = -EINVAL;
		goto err;
	}

	/* Get the ib_qp given the QPN */
	qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
	if (!qp) {
		ret = -EINVAL;
		goto err;
	}
	cm_id->device->ops.iw_add_ref(qp);
	cm_id_priv->qp = qp;
	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);

	/* Mapping and the provider downcall are made with the lock released. */
	ret = iw_cm_map(cm_id, true);
	if (!ret)
		ret = cm_id->device->ops.iw_connect(cm_id, iw_param);
	if (!ret)
		return 0;	/* success */

	/* Failure: detach the QP and return to IDLE under the lock. */
	spin_lock_irqsave(&cm_id_priv->lock, flags);
	qp = cm_id_priv->qp;
	cm_id_priv->qp = NULL;
	cm_id_priv->state = IW_CM_STATE_IDLE;
err:
	/* Every path reaching this label holds cm_id_priv->lock. */
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
	if (qp)
		cm_id->device->ops.iw_rem_ref(qp);
	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
	wake_up_all(&cm_id_priv->connect_wait);
	return ret;
}
EXPORT_SYMBOL(iw_cm_connect);

/*
 * Passive Side: new CM_ID <-- CONN_RECV
 *
 * Handles an inbound connect request. The function creates a new
 * iw_cm_id to represent the new connection and inherits the client
 * callback function and other attributes from the listening parent.
 *
 * The work item contains a pointer to the listen_cm_id and the event. The
 * listen_cm_id contains the client cm_handler, context and
 * device. These are copied when the device is cloned. The event
 * contains the new four tuple.
 *
 * An error on the child should not affect the parent, so this
 * function does not return a value.
*/ static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; struct iw_cm_id *cm_id; struct iwcm_id_private *cm_id_priv; int ret; /* * The provider should never generate a connection request * event with a bad status. */ BUG_ON(iw_event->status); cm_id = iw_create_cm_id(listen_id_priv->id.device, listen_id_priv->id.cm_handler, listen_id_priv->id.context); /* If the cm_id could not be created, ignore the request */ if (IS_ERR(cm_id)) goto out; cm_id->provider_data = iw_event->provider_data; cm_id->m_local_addr = iw_event->local_addr; cm_id->m_remote_addr = iw_event->remote_addr; cm_id->local_addr = listen_id_priv->id.local_addr; ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr, &iw_event->remote_addr, &cm_id->remote_addr, RDMA_NL_IWCM); if (ret) { cm_id->remote_addr = iw_event->remote_addr; } else { iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr, &iw_event->local_addr, &cm_id->local_addr); iw_event->local_addr = cm_id->local_addr; iw_event->remote_addr = cm_id->remote_addr; } cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); cm_id_priv->state = IW_CM_STATE_CONN_RECV; /* * We could be destroying the listening id. If so, ignore this * upcall. 
*/ spin_lock_irqsave(&listen_id_priv->lock, flags); if (listen_id_priv->state != IW_CM_STATE_LISTEN) { spin_unlock_irqrestore(&listen_id_priv->lock, flags); iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } spin_unlock_irqrestore(&listen_id_priv->lock, flags); ret = alloc_work_entries(cm_id_priv, 3); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); goto out; } /* Call the client CM handler */ ret = cm_id->cm_handler(cm_id, iw_event); if (ret) { iw_cm_reject(cm_id, NULL, 0); iw_destroy_cm_id(cm_id); } out: if (iw_event->private_data_len) kfree(iw_event->private_data); } /* * Passive Side: CM_ID <-- ESTABLISHED * * The provider generated an ESTABLISHED event which means that * the MPA negotion has completed successfully and we are now in MPA * FPDU mode. * * This event can only be received in the CONN_RECV state. If the * remote peer closed, the ESTABLISHED event would be received followed * by the CLOSE event. If the app closes, it will block until we wake * it up after processing this event. */ static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * We clear the CONNECT_WAIT bit here to allow the callback * function to call iw_cm_disconnect. Calling iw_destroy_cm_id * from a callback handler is not allowed. */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_ESTABLISHED; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * Active Side: CM_ID <-- ESTABLISHED * * The app has called connect and is waiting for the established event to * post it's requests to the server. This event will wake up anyone * blocked in iw_cm_disconnect or iw_destroy_id. 
*/ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { struct ib_qp *qp = NULL; unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); /* * Clear the connect wait bit so a callback function calling * iw_cm_disconnect will not wait and deadlock this thread */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); if (iw_event->status == 0) { cm_id_priv->id.m_local_addr = iw_event->local_addr; cm_id_priv->id.m_remote_addr = iw_event->remote_addr; iw_event->local_addr = cm_id_priv->id.local_addr; iw_event->remote_addr = cm_id_priv->id.remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ qp = cm_id_priv->qp; cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id_priv->id.device->ops.iw_rem_ref(qp); ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); if (iw_event->private_data_len) kfree(iw_event->private_data); /* Wake up waiters on connect complete */ wake_up_all(&cm_id_priv->connect_wait); return ret; } /* * CM_ID <-- CLOSING * * If in the ESTABLISHED state, move to CLOSING. */ static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { unsigned long flags; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED) cm_id_priv->state = IW_CM_STATE_CLOSING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); } /* * CM_ID <-- IDLE * * If in the ESTBLISHED or CLOSING states, the QP will have have been * moved by the provider to the ERR state. Disassociate the CM_ID from * the QP, move to IDLE, and remove the 'connected' reference. * * If in some other state, the cm_id was destroyed asynchronously. * This is the last reference that will result in waking up * the app thread blocked in iw_destroy_cm_id. 
*/ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { struct ib_qp *qp; unsigned long flags; int ret = 0, notify_event = 0; spin_lock_irqsave(&cm_id_priv->lock, flags); qp = cm_id_priv->qp; cm_id_priv->qp = NULL; switch (cm_id_priv->state) { case IW_CM_STATE_ESTABLISHED: case IW_CM_STATE_CLOSING: cm_id_priv->state = IW_CM_STATE_IDLE; notify_event = 1; break; case IW_CM_STATE_DESTROYING: break; default: BUG(); } spin_unlock_irqrestore(&cm_id_priv->lock, flags); if (qp) cm_id_priv->id.device->ops.iw_rem_ref(qp); if (notify_event) ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event); return ret; } static int process_event(struct iwcm_id_private *cm_id_priv, struct iw_cm_event *iw_event) { int ret = 0; switch (iw_event->event) { case IW_CM_EVENT_CONNECT_REQUEST: cm_conn_req_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CONNECT_REPLY: ret = cm_conn_rep_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_ESTABLISHED: ret = cm_conn_est_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_DISCONNECT: cm_disconnect_handler(cm_id_priv, iw_event); break; case IW_CM_EVENT_CLOSE: ret = cm_close_handler(cm_id_priv, iw_event); break; default: BUG(); } return ret; } /* * Process events on the work_list for the cm_id. If the callback * function requests that the cm_id be deleted, a flag is set in the * cm_id flags to indicate that when the last reference is * removed, the cm_id is to be destroyed. This is necessary to * distinguish between an object that will be destroyed by the app * thread asleep on the destroy_comp list vs. an object destroyed * here synchronously when the last reference is removed. 
*/
/*
 * Workqueue callback: drains the cm_id's work_list.
 *
 * @_work: the work_struct embedded in the iwcm_work that was queued
 */
static void cm_work_handler(struct work_struct *_work)
{
	struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
	struct iw_cm_event levent;
	struct iwcm_id_private *cm_id_priv = work->cm_id;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&cm_id_priv->lock, flags);
	while (!list_empty(&cm_id_priv->work_list)) {
		work = list_first_entry(&cm_id_priv->work_list,
					struct iwcm_work, list);
		list_del_init(&work->list);
		/*
		 * Copy the event out before the work entry goes back on the
		 * free list; processing below uses only the local copy.
		 */
		levent = work->event;
		put_work(work);
		/* Event processing runs with the lock released. */
		spin_unlock_irqrestore(&cm_id_priv->lock, flags);

		if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
			ret = process_event(cm_id_priv, &levent);
			/*
			 * A nonzero result from the handler destroys the
			 * cm_id. Warn if that call reports the last
			 * reference gone, since the reference held for
			 * this event is only dropped below.
			 */
			if (ret)
				WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
		} else
			pr_debug("dropping event %d\n", levent.event);
		/*
		 * Drop the reference taken when the event was queued. If it
		 * was the last one the cm_id has been freed, so return
		 * without touching it again.
		 */
		if (iwcm_deref_id(cm_id_priv))
			return;
		spin_lock_irqsave(&cm_id_priv->lock, flags);
	}
	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
}

/*
 * This function is called on interrupt context. Schedule events on
 * the iwcm_wq thread to allow callback functions to downcall into
 * the CM and/or block. Events are queued to a per-CM_ID
 * work_list. If this is the first event on the work_list, the work
 * element is also queued on the iwcm_wq thread.
 *
 * Each event holds a reference on the cm_id. Until the last posted
 * event has been delivered and processed, the cm_id cannot be
 * deleted.
 *
 * Returns:
 *	      0	- the event was handled.
 *	-ENOMEM	- the event was not handled due to lack of resources.
*/ static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct iwcm_work *work; struct iwcm_id_private *cm_id_priv; unsigned long flags; int ret = 0; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); work = get_work(cm_id_priv); if (!work) { ret = -ENOMEM; goto out; } INIT_WORK(&work->work, cm_work_handler); work->cm_id = cm_id_priv; work->event = *iw_event; if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST || work->event.event == IW_CM_EVENT_CONNECT_REPLY) && work->event.private_data_len) { ret = copy_private_data(&work->event); if (ret) { put_work(work); goto out; } } refcount_inc(&cm_id_priv->refcount); list_add_tail(&work->list, &cm_id_priv->work_list); queue_work(iwcm_wq, &work->work); out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_READ; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { unsigned long flags; int ret; spin_lock_irqsave(&cm_id_priv->lock, flags); switch (cm_id_priv->state) { case IW_CM_STATE_IDLE: case IW_CM_STATE_CONN_SENT: case IW_CM_STATE_CONN_RECV: case IW_CM_STATE_ESTABLISHED: *qp_attr_mask = 0; ret = 0; break; default: ret = -EINVAL; break; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct 
iwcm_id_private *cm_id_priv; int ret; cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); switch (qp_attr->qp_state) { case IB_QPS_INIT: case IB_QPS_RTR: ret = iwcm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask); break; case IB_QPS_RTS: ret = iwcm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: ret = -EINVAL; break; } return ret; } EXPORT_SYMBOL(iw_cm_init_qp_attr); static int __init iw_cm_init(void) { int ret; ret = iwpm_init(RDMA_NL_IWCM); if (ret) return ret; iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) goto err_alloc; iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", iwcm_ctl_table); if (!iwcm_ctl_table_hdr) { pr_err("iw_cm: couldn't register sysctl paths\n"); goto err_sysctl; } rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); return 0; err_sysctl: destroy_workqueue(iwcm_wq); err_alloc: iwpm_exit(RDMA_NL_IWCM); return -ENOMEM; } static void __exit iw_cm_cleanup(void) { rdma_nl_unregister(RDMA_NL_IWCM); unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); iwpm_exit(RDMA_NL_IWCM); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); module_init(iw_cm_init); module_exit(iw_cm_cleanup);
20 3 20 20 20 20 3 20 17 3 3 20 20 20 20 20 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 
1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 
1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 
1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 
2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 
2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 
3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 
3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 // SPDX-License-Identifier: GPL-2.0+ /* * comedi/comedi_fops.c * comedi kernel module * * COMEDI - Linux Control and Measurement Device Interface * Copyright (C) 1997-2007 David A. Schleef <ds@schleef.org> * compat ioctls: * Author: Ian Abbott, MEV Ltd. <abbotti@mev.co.uk> * Copyright (C) 2007 MEV Ltd. <http://www.mev.co.uk/> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fcntl.h> #include <linux/delay.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/fs.h> #include <linux/comedi/comedidev.h> #include <linux/cdev.h> #include <linux/io.h> #include <linux/uaccess.h> #include <linux/compat.h> #include "comedi_internal.h" /* * comedi_subdevice "runflags" * COMEDI_SRF_RT: DEPRECATED: command is running real-time * COMEDI_SRF_ERROR: indicates an COMEDI_CB_ERROR event has occurred * since the last command was started * COMEDI_SRF_RUNNING: command is running * COMEDI_SRF_FREE_SPRIV: free s->private on detach * * COMEDI_SRF_BUSY_MASK: runflags that indicate the subdevice is "busy" */ #define COMEDI_SRF_RT BIT(1) #define COMEDI_SRF_ERROR BIT(2) #define COMEDI_SRF_RUNNING BIT(27) #define COMEDI_SRF_FREE_SPRIV BIT(31) #define COMEDI_SRF_BUSY_MASK (COMEDI_SRF_ERROR | COMEDI_SRF_RUNNING) /** * struct comedi_file - Per-file private data for COMEDI device * @dev: COMEDI device. * @read_subdev: Current "read" subdevice. * @write_subdev: Current "write" subdevice. * @last_detach_count: Last known detach count. * @last_attached: Last known attached/detached state. 
*/
struct comedi_file {
	struct comedi_device *dev;
	struct comedi_subdevice *read_subdev;
	struct comedi_subdevice *write_subdev;
	unsigned int last_detach_count;
	unsigned int last_attached:1;
};

/* Total number of minor device numbers: board minors plus subdevice minors. */
#define COMEDI_NUM_MINORS 0x100
/* Minor numbers that remain for subdevices once the board minors are removed. */
#define COMEDI_NUM_SUBDEVICE_MINORS	\
	(COMEDI_NUM_MINORS - COMEDI_NUM_BOARD_MINORS)

/* Module parameter registered with mode 0444. */
static unsigned short comedi_num_legacy_minors;
module_param(comedi_num_legacy_minors, ushort, 0444);
MODULE_PARM_DESC(comedi_num_legacy_minors,
		 "number of comedi minor devices to reserve for non-auto-configured devices (default 0)"
		);

/* Non-static module parameter (mode 0644), initialised from Kconfig. */
unsigned int comedi_default_buf_size_kb = CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB;
module_param(comedi_default_buf_size_kb, uint, 0644);
MODULE_PARM_DESC(comedi_default_buf_size_kb,
		 "default asynchronous buffer size in KiB (default "
		 __MODULE_STRING(CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB) ")");

/* Non-static module parameter (mode 0644), initialised from Kconfig. */
unsigned int comedi_default_buf_maxsize_kb =
	CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB;
module_param(comedi_default_buf_maxsize_kb, uint, 0644);
MODULE_PARM_DESC(comedi_default_buf_maxsize_kb,
		 "default maximum size of asynchronous buffer in KiB (default "
		 __MODULE_STRING(CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB) ")");

/* Board minor -> device table; accessed under comedi_board_minor_table_lock. */
static DEFINE_MUTEX(comedi_board_minor_table_lock);
static struct comedi_device
*comedi_board_minor_table[COMEDI_NUM_BOARD_MINORS];

/*
 * Subdevice minor -> subdevice table; accessed under
 * comedi_subdevice_minor_table_lock.
 */
static DEFINE_MUTEX(comedi_subdevice_minor_table_lock);
/* Note: indexed by minor - COMEDI_NUM_BOARD_MINORS. */
static struct comedi_subdevice
*comedi_subdevice_minor_table[COMEDI_NUM_SUBDEVICE_MINORS];

static struct cdev comedi_cdev;

/*
 * Initialise the reference count, spinlock, mutex and attach rwsem embedded
 * in @dev, and set its minor number to -1 (presumably meaning "no minor
 * assigned yet" - confirm against the code that allocates minors).
 */
static void comedi_device_init(struct comedi_device *dev)
{
	kref_init(&dev->refcount);
	spin_lock_init(&dev->spinlock);
	mutex_init(&dev->mutex);
	init_rwsem(&dev->attach_lock);
	dev->minor = -1;
}

/*
 * kref release callback for a COMEDI device: destroys the device mutex,
 * drops the reference held on dev->class_dev and frees the device memory.
 */
static void comedi_dev_kref_release(struct kref *kref)
{
	struct comedi_device *dev =
		container_of(kref, struct comedi_device, refcount);

	mutex_destroy(&dev->mutex);
	put_device(dev->class_dev);
	kfree(dev);
}

/**
 * comedi_dev_put() - Release a use of a COMEDI device
 * @dev: COMEDI device.
*
 * Must be called when a user of a COMEDI device is finished with it.
 * When the last user of the COMEDI device calls this function, the
 * COMEDI device is destroyed.
 *
 * Return: 1 if the COMEDI device is destroyed by this call or @dev is
 * NULL, otherwise return 0.  Callers must not assume the COMEDI
 * device is still valid if this function returns 0.
 */
int comedi_dev_put(struct comedi_device *dev)
{
	if (dev)
		return kref_put(&dev->refcount, comedi_dev_kref_release);
	return 1;
}
EXPORT_SYMBOL_GPL(comedi_dev_put);

/*
 * Take an extra reference on @dev if it is non-NULL.
 * Returns @dev unchanged (so NULL in, NULL out).
 */
static struct comedi_device *comedi_dev_get(struct comedi_device *dev)
{
	if (dev)
		kref_get(&dev->refcount);
	return dev;
}

/*
 * Detach @dev from its low-level driver while holding dev->mutex.
 * A NULL @dev is ignored.  If the device was attached and dev->use_count is
 * non-zero after the detach, one reference on the driver module is dropped.
 */
static void comedi_device_cleanup(struct comedi_device *dev)
{
	struct module *driver_module = NULL;

	if (!dev)
		return;
	mutex_lock(&dev->mutex);
	/*
	 * Remember the module before detaching (presumably because the
	 * detach invalidates dev->driver - confirm in comedi_device_detach()).
	 */
	if (dev->attached)
		driver_module = dev->driver->module;
	comedi_device_detach(dev);
	if (driver_module && dev->use_count)
		module_put(driver_module);
	mutex_unlock(&dev->mutex);
}

/*
 * Remove @dev from the board minor table, but only if the slot for
 * dev->minor still points at @dev.  Caller must hold dev->mutex.
 * Returns true if the slot was cleared.
 */
static bool comedi_clear_board_dev(struct comedi_device *dev)
{
	unsigned int i = dev->minor;
	bool cleared = false;

	lockdep_assert_held(&dev->mutex);
	mutex_lock(&comedi_board_minor_table_lock);
	if (dev == comedi_board_minor_table[i]) {
		comedi_board_minor_table[i] = NULL;
		cleared = true;
	}
	mutex_unlock(&comedi_board_minor_table_lock);
	return cleared;
}

/*
 * Unconditionally empty the board minor table slot for @minor and return
 * whatever device pointer it held (possibly NULL).  @minor is used as an
 * index without a range check here.
 */
static struct comedi_device *comedi_clear_board_minor(unsigned int minor)
{
	struct comedi_device *dev;

	mutex_lock(&comedi_board_minor_table_lock);
	dev = comedi_board_minor_table[minor];
	comedi_board_minor_table[minor] = NULL;
	mutex_unlock(&comedi_board_minor_table_lock);
	return dev;
}

/*
 * Look up the subdevice registered for subdevice minor @minor.
 * Returns NULL if the slot is empty or the subdevice belongs to a device
 * other than @dev.  @minor is not range-checked here; the index is
 * minor - COMEDI_NUM_BOARD_MINORS.
 */
static struct comedi_subdevice *
comedi_subdevice_from_minor(const struct comedi_device *dev,
			    unsigned int minor)
{
	struct comedi_subdevice *s;
	unsigned int i = minor - COMEDI_NUM_BOARD_MINORS;

	mutex_lock(&comedi_subdevice_minor_table_lock);
	s = comedi_subdevice_minor_table[i];
	if (s && s->device != dev)
		s = NULL;
	mutex_unlock(&comedi_subdevice_minor_table_lock);
	return s;
}

static struct
comedi_device *comedi_dev_get_from_board_minor(unsigned int minor)
{
	struct comedi_device *dev;

	/*
	 * The reference is taken while the table lock is held, the same lock
	 * under which table slots are cleared.
	 */
	mutex_lock(&comedi_board_minor_table_lock);
	dev = comedi_dev_get(comedi_board_minor_table[minor]);
	mutex_unlock(&comedi_board_minor_table_lock);
	return dev;
}

/*
 * Find the subdevice registered for subdevice minor @minor and return its
 * owning device with an extra reference taken, or NULL if the slot is empty.
 * The table index is minor - COMEDI_NUM_BOARD_MINORS; no range check is done
 * here.
 */
static struct comedi_device *
comedi_dev_get_from_subdevice_minor(unsigned int minor)
{
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int i = minor - COMEDI_NUM_BOARD_MINORS;

	mutex_lock(&comedi_subdevice_minor_table_lock);
	s = comedi_subdevice_minor_table[i];
	dev = comedi_dev_get(s ? s->device : NULL);
	mutex_unlock(&comedi_subdevice_minor_table_lock);
	return dev;
}

/**
 * comedi_dev_get_from_minor() - Get COMEDI device by minor device number
 * @minor: Minor device number.
 *
 * Finds the COMEDI device associated with the minor device number, if any,
 * and increments its reference count.  The COMEDI device is prevented from
 * being freed until a matching call is made to comedi_dev_put().
 *
 * Return: A pointer to the COMEDI device if it exists, with its usage
 * reference incremented.  Return NULL if no COMEDI device exists with the
 * specified minor device number.
*/
struct comedi_device *comedi_dev_get_from_minor(unsigned int minor)
{
	/* Minors below COMEDI_NUM_BOARD_MINORS are board minors. */
	if (minor < COMEDI_NUM_BOARD_MINORS)
		return comedi_dev_get_from_board_minor(minor);

	return comedi_dev_get_from_subdevice_minor(minor);
}
EXPORT_SYMBOL_GPL(comedi_dev_get_from_minor);

/*
 * Choose the "read" subdevice for @minor on @dev.  Caller holds dev->mutex.
 *
 * For a subdevice minor: returns NULL if no subdevice of @dev is registered
 * for it, or that subdevice if it has SDF_CMD_READ set.  In every other case
 * the device's default dev->read_subdev is returned.
 */
static struct comedi_subdevice *
comedi_read_subdevice(const struct comedi_device *dev, unsigned int minor)
{
	struct comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	if (minor >= COMEDI_NUM_BOARD_MINORS) {
		s = comedi_subdevice_from_minor(dev, minor);
		if (!s || (s->subdev_flags & SDF_CMD_READ))
			return s;
	}
	return dev->read_subdev;
}

/*
 * Choose the "write" subdevice for @minor on @dev.  Caller holds dev->mutex.
 * Same rules as comedi_read_subdevice(), using SDF_CMD_WRITE and
 * dev->write_subdev.
 */
static struct comedi_subdevice *
comedi_write_subdevice(const struct comedi_device *dev, unsigned int minor)
{
	struct comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	if (minor >= COMEDI_NUM_BOARD_MINORS) {
		s = comedi_subdevice_from_minor(dev, minor);
		if (!s || (s->subdev_flags & SDF_CMD_WRITE))
			return s;
	}
	return dev->write_subdev;
}

/*
 * Recompute the per-file cached read/write subdevices from the current
 * device state and record the device's attached flag and detach count so
 * that comedi_file_check() can detect later changes.
 */
static void comedi_file_reset(struct file *file)
{
	struct comedi_file *cfp = file->private_data;
	struct comedi_device *dev = cfp->dev;
	struct comedi_subdevice *s, *read_s, *write_s;
	unsigned int minor = iminor(file_inode(file));

	read_s = dev->read_subdev;
	write_s = dev->write_subdev;
	if (minor >= COMEDI_NUM_BOARD_MINORS) {
		/* A NULL lookup result overrides both defaults with NULL. */
		s = comedi_subdevice_from_minor(dev, minor);
		if (!s || s->subdev_flags & SDF_CMD_READ)
			read_s = s;
		if (!s || s->subdev_flags & SDF_CMD_WRITE)
			write_s = s;
	}
	cfp->last_attached = dev->attached;
	cfp->last_detach_count = dev->detach_count;
	WRITE_ONCE(cfp->read_subdev, read_s);
	WRITE_ONCE(cfp->write_subdev, write_s);
}

/*
 * Refresh the per-file cached state if the device's attached flag or detach
 * count differs from the values recorded at the last reset.
 */
static void comedi_file_check(struct file *file)
{
	struct comedi_file *cfp = file->private_data;
	struct comedi_device *dev = cfp->dev;

	if (cfp->last_attached != dev->attached ||
	    cfp->last_detach_count != dev->detach_count)
		comedi_file_reset(file);
}

/* Return the file's current "read" subdevice after revalidating the cache. */
static struct comedi_subdevice *comedi_file_read_subdevice(struct file *file)
{
	struct comedi_file *cfp = file->private_data;

	comedi_file_check(file);
	return
READ_ONCE(cfp->read_subdev);
}

/* Return the file's current "write" subdevice after revalidating the cache. */
static struct comedi_subdevice *comedi_file_write_subdevice(struct file *file)
{
	struct comedi_file *cfp = file->private_data;

	comedi_file_check(file);
	return READ_ONCE(cfp->write_subdev);
}

/*
 * Resize the asynchronous buffer of subdevice @s to @new_size bytes, rounded
 * up to a whole number of pages.  Caller holds dev->mutex and must pass a
 * subdevice with a non-NULL s->async (it is dereferenced unconditionally).
 *
 * Returns 0 on success, -EPERM if @new_size exceeds async->max_bufsize,
 * -EBUSY if the subdevice is busy or its buffer is mmapped, or a negative
 * value from comedi_buf_alloc() or the optional s->buf_change() callback.
 */
static int resize_async_buffer(struct comedi_device *dev,
			       struct comedi_subdevice *s,
			       unsigned int new_size)
{
	struct comedi_async *async = s->async;
	int retval;

	lockdep_assert_held(&dev->mutex);

	/* The limit is checked against the requested size, before rounding. */
	if (new_size > async->max_bufsize)
		return -EPERM;

	if (s->busy) {
		dev_dbg(dev->class_dev,
			"subdevice is busy, cannot resize buffer\n");
		return -EBUSY;
	}
	if (comedi_buf_is_mmapped(s)) {
		dev_dbg(dev->class_dev,
			"subdevice is mmapped, cannot resize buffer\n");
		return -EBUSY;
	}

	/* make sure buffer is an integral number of pages (we round up) */
	new_size = (new_size + PAGE_SIZE - 1) & PAGE_MASK;

	retval = comedi_buf_alloc(dev, s, new_size);
	if (retval < 0)
		return retval;

	/* Give the driver's optional buf_change callback a chance to react. */
	if (s->buf_change) {
		retval = s->buf_change(dev, s);
		if (retval < 0)
			return retval;
	}

	dev_dbg(dev->class_dev, "subd %d buffer resized to %i bytes\n",
		s->index, async->prealloc_bufsz);
	return 0;
}

/* sysfs attribute files */

/*
 * sysfs "max_read_buffer_kb" show: emits async->max_bufsize / 1024 for the
 * read subdevice selected by the device node's minor, or 0 if there is no
 * read-capable subdevice with an async structure.  Returns -ENODEV if no
 * COMEDI device exists for the minor.
 */
static ssize_t max_read_buffer_kb_show(struct device *csdev,
				       struct device_attribute *attr, char *buf)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size = 0;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_read_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_READ) && s->async)
		size = s->async->max_bufsize / 1024;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return sysfs_emit(buf, "%u\n", size);
}

/*
 * sysfs "max_read_buffer_kb" store: parses a decimal value, multiplies it by
 * 1024 and stores it as the read subdevice's async->max_bufsize.  Returns
 * @count on success, the kstrtouint() error for unparsable input, -EINVAL if
 * the multiplication would overflow or there is no suitable subdevice, or
 * -ENODEV if no COMEDI device exists for the minor.
 */
static ssize_t max_read_buffer_kb_store(struct device *csdev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size;
	int err;

	err = kstrtouint(buf, 10, &size);
	if (err)
		return err;
	/* Reject values whose multiplication by 1024 would overflow. */
	if (size > (UINT_MAX / 1024))
		return -EINVAL;
size *= 1024;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_read_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_READ) && s->async)
		s->async->max_bufsize = size;
	else
		err = -EINVAL;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	/* err is still 0 from kstrtouint() unless set to -EINVAL above. */
	return err ? err : count;
}
static DEVICE_ATTR_RW(max_read_buffer_kb);

/*
 * sysfs "read_buffer_kb" show: emits async->prealloc_bufsz / 1024 for the
 * read subdevice selected by the device node's minor, or 0 if there is no
 * read-capable subdevice with an async structure.  Returns -ENODEV if no
 * COMEDI device exists for the minor.
 */
static ssize_t read_buffer_kb_show(struct device *csdev,
				   struct device_attribute *attr, char *buf)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size = 0;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_read_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_READ) && s->async)
		size = s->async->prealloc_bufsz / 1024;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return sysfs_emit(buf, "%u\n", size);
}

/*
 * sysfs "read_buffer_kb" store: parses a decimal value, multiplies it by
 * 1024 and resizes the read subdevice's buffer via resize_async_buffer().
 * Returns @count on success, otherwise a negative error code (parse error,
 * -EINVAL on overflow or missing subdevice, -ENODEV, or the resize error).
 */
static ssize_t read_buffer_kb_store(struct device *csdev,
				    struct device_attribute *attr,
				    const char *buf, size_t count)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size;
	int err;

	err = kstrtouint(buf, 10, &size);
	if (err)
		return err;
	/* Reject values whose multiplication by 1024 would overflow. */
	if (size > (UINT_MAX / 1024))
		return -EINVAL;
	size *= 1024;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_read_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_READ) && s->async)
		err = resize_async_buffer(dev, s, size);
	else
		err = -EINVAL;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return err ?
err : count;
}
static DEVICE_ATTR_RW(read_buffer_kb);

/*
 * sysfs "max_write_buffer_kb" show: emits async->max_bufsize / 1024 for the
 * write subdevice selected by the device node's minor, or 0 if there is no
 * write-capable subdevice with an async structure.  Returns -ENODEV if no
 * COMEDI device exists for the minor.
 */
static ssize_t max_write_buffer_kb_show(struct device *csdev,
					struct device_attribute *attr,
					char *buf)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size = 0;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_write_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_WRITE) && s->async)
		size = s->async->max_bufsize / 1024;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return sysfs_emit(buf, "%u\n", size);
}

/*
 * sysfs "max_write_buffer_kb" store: parses a decimal value, multiplies it
 * by 1024 and stores it as the write subdevice's async->max_bufsize.
 * Returns @count on success, otherwise a negative error code (parse error,
 * -EINVAL on overflow or missing subdevice, or -ENODEV).
 */
static ssize_t max_write_buffer_kb_store(struct device *csdev,
					 struct device_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size;
	int err;

	err = kstrtouint(buf, 10, &size);
	if (err)
		return err;
	/* Reject values whose multiplication by 1024 would overflow. */
	if (size > (UINT_MAX / 1024))
		return -EINVAL;
	size *= 1024;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_write_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_WRITE) && s->async)
		s->async->max_bufsize = size;
	else
		err = -EINVAL;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return err ?
err : count;
}
static DEVICE_ATTR_RW(max_write_buffer_kb);

/*
 * sysfs "write_buffer_kb" show: emits async->prealloc_bufsz / 1024 for the
 * write subdevice selected by the device node's minor, or 0 if there is no
 * write-capable subdevice with an async structure.  Returns -ENODEV if no
 * COMEDI device exists for the minor.
 */
static ssize_t write_buffer_kb_show(struct device *csdev,
				    struct device_attribute *attr, char *buf)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size = 0;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_write_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_WRITE) && s->async)
		size = s->async->prealloc_bufsz / 1024;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return sysfs_emit(buf, "%u\n", size);
}

/*
 * sysfs "write_buffer_kb" store: parses a decimal value, multiplies it by
 * 1024 and resizes the write subdevice's buffer via resize_async_buffer().
 * Returns @count on success, otherwise a negative error code (parse error,
 * -EINVAL on overflow or missing subdevice, -ENODEV, or the resize error).
 */
static ssize_t write_buffer_kb_store(struct device *csdev,
				     struct device_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned int minor = MINOR(csdev->devt);
	struct comedi_device *dev;
	struct comedi_subdevice *s;
	unsigned int size;
	int err;

	err = kstrtouint(buf, 10, &size);
	if (err)
		return err;
	/* Reject values whose multiplication by 1024 would overflow. */
	if (size > (UINT_MAX / 1024))
		return -EINVAL;
	size *= 1024;

	dev = comedi_dev_get_from_minor(minor);
	if (!dev)
		return -ENODEV;

	mutex_lock(&dev->mutex);
	s = comedi_write_subdevice(dev, minor);
	if (s && (s->subdev_flags & SDF_CMD_WRITE) && s->async)
		err = resize_async_buffer(dev, s, size);
	else
		err = -EINVAL;
	mutex_unlock(&dev->mutex);

	comedi_dev_put(dev);
	return err ?
err : count;
}
static DEVICE_ATTR_RW(write_buffer_kb);

/* The four buffer-size attributes exposed on each COMEDI class device. */
static struct attribute *comedi_dev_attrs[] = {
	&dev_attr_max_read_buffer_kb.attr,
	&dev_attr_read_buffer_kb.attr,
	&dev_attr_max_write_buffer_kb.attr,
	&dev_attr_write_buffer_kb.attr,
	NULL,
};
ATTRIBUTE_GROUPS(comedi_dev);

/* Device class "comedi"; its devices get the attribute groups above. */
static const struct class comedi_class = {
	.name = "comedi",
	.dev_groups = comedi_dev_groups,
};

/*
 * Tear down a board device: detach it from its driver, destroy its class
 * device (if one exists) and drop one reference on the COMEDI device.
 * A NULL @dev is ignored.
 */
static void comedi_free_board_dev(struct comedi_device *dev)
{
	if (dev) {
		comedi_device_cleanup(dev);
		if (dev->class_dev) {
			device_destroy(&comedi_class,
				       MKDEV(COMEDI_MAJOR, dev->minor));
		}
		comedi_dev_put(dev);
	}
}

/* Clear @bits in s->runflags.  Takes no lock itself. */
static void __comedi_clear_subdevice_runflags(struct comedi_subdevice *s,
					      unsigned int bits)
{
	s->runflags &= ~bits;
}

/* Set @bits in s->runflags.  Takes no lock itself. */
static void __comedi_set_subdevice_runflags(struct comedi_subdevice *s,
					    unsigned int bits)
{
	s->runflags |= bits;
}

/*
 * Replace the run flags selected by @mask with the corresponding bits of
 * @bits, under s->spin_lock with interrupts disabled.  Bits of @bits outside
 * @mask are ignored.
 */
static void comedi_update_subdevice_runflags(struct comedi_subdevice *s,
					     unsigned int mask,
					     unsigned int bits)
{
	unsigned long flags;

	spin_lock_irqsave(&s->spin_lock, flags);
	__comedi_clear_subdevice_runflags(s, mask);
	__comedi_set_subdevice_runflags(s, bits & mask);
	spin_unlock_irqrestore(&s->spin_lock, flags);
}

/* Read s->runflags.  Takes no lock itself. */
static unsigned int __comedi_get_subdevice_runflags(struct comedi_subdevice *s)
{
	return s->runflags;
}

/* Read s->runflags under s->spin_lock with interrupts disabled. */
static unsigned int comedi_get_subdevice_runflags(struct comedi_subdevice *s)
{
	unsigned long flags;
	unsigned int runflags;

	spin_lock_irqsave(&s->spin_lock, flags);
	runflags = __comedi_get_subdevice_runflags(s);
	spin_unlock_irqrestore(&s->spin_lock, flags);
	return runflags;
}

/* True if COMEDI_SRF_RUNNING is set in @runflags. */
static bool comedi_is_runflags_running(unsigned int runflags)
{
	return runflags & COMEDI_SRF_RUNNING;
}

/* True if COMEDI_SRF_ERROR is set in @runflags. */
static bool comedi_is_runflags_in_error(unsigned int runflags)
{
	return runflags & COMEDI_SRF_ERROR;
}

/**
 * comedi_is_subdevice_running() - Check if async command running on subdevice
 * @s: COMEDI subdevice.
 *
 * Return: %true if an asynchronous COMEDI command is active on the
 * subdevice, else %false.
*/ bool comedi_is_subdevice_running(struct comedi_subdevice *s) { unsigned int runflags = comedi_get_subdevice_runflags(s); return comedi_is_runflags_running(runflags); } EXPORT_SYMBOL_GPL(comedi_is_subdevice_running); static bool __comedi_is_subdevice_running(struct comedi_subdevice *s) { unsigned int runflags = __comedi_get_subdevice_runflags(s); return comedi_is_runflags_running(runflags); } bool comedi_can_auto_free_spriv(struct comedi_subdevice *s) { unsigned int runflags = __comedi_get_subdevice_runflags(s); return runflags & COMEDI_SRF_FREE_SPRIV; } /** * comedi_set_spriv_auto_free() - Mark subdevice private data as freeable * @s: COMEDI subdevice. * * Mark the subdevice as having a pointer to private data that can be * automatically freed when the COMEDI device is detached from the low-level * driver. */ void comedi_set_spriv_auto_free(struct comedi_subdevice *s) { __comedi_set_subdevice_runflags(s, COMEDI_SRF_FREE_SPRIV); } EXPORT_SYMBOL_GPL(comedi_set_spriv_auto_free); /** * comedi_alloc_spriv - Allocate memory for the subdevice private data * @s: COMEDI subdevice. * @size: Size of the memory to allocate. * * Allocate memory for the subdevice private data and point @s->private * to it. The memory will be freed automatically when the COMEDI device * is detached from the low-level driver. * * Return: A pointer to the allocated memory @s->private on success. * Return NULL on failure. */ void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size) { s->private = kzalloc(size, GFP_KERNEL); if (s->private) comedi_set_spriv_auto_free(s); return s->private; } EXPORT_SYMBOL_GPL(comedi_alloc_spriv); /* * This function restores a subdevice to an idle state. 
*/
static void do_become_nonbusy(struct comedi_device *dev,
			      struct comedi_subdevice *s)
{
	struct comedi_async *async = s->async;

	lockdep_assert_held(&dev->mutex);
	/* Clear the RUNNING flag first; other run flags are left untouched. */
	comedi_update_subdevice_runflags(s, COMEDI_SRF_RUNNING, 0);
	if (async) {
		comedi_buf_reset(s);
		async->inttrig = NULL;
		kfree(async->cmd.chanlist);
		async->cmd.chanlist = NULL;
		s->busy = NULL;
		/* Wake every waiter on the async wait queue. */
		wake_up_interruptible_all(&async->wait_head);
	} else {
		dev_err(dev->class_dev,
			"BUG: (?) %s called with async=NULL\n", __func__);
		s->busy = NULL;
	}
}

/*
 * Cancel any command on subdevice @s and make it non-busy.  Caller holds
 * dev->mutex.  The driver's s->cancel() is called only if a command is
 * running and the callback exists; the subdevice is made non-busy either
 * way.  Returns the s->cancel() result, or 0 if it was not called.
 */
static int do_cancel(struct comedi_device *dev, struct comedi_subdevice *s)
{
	int ret = 0;

	lockdep_assert_held(&dev->mutex);
	if (comedi_is_subdevice_running(s) && s->cancel)
		ret = s->cancel(dev, s);

	do_become_nonbusy(dev, s);

	return ret;
}

/*
 * Cancel commands on every subdevice of @dev that has an async structure.
 * Caller holds dev->mutex.  Does nothing if the device is not attached;
 * individual do_cancel() results are ignored.
 */
void comedi_device_cancel_all(struct comedi_device *dev)
{
	struct comedi_subdevice *s;
	int i;

	lockdep_assert_held(&dev->mutex);
	if (!dev->attached)
		return;

	for (i = 0; i < dev->n_subdevices; i++) {
		s = &dev->subdevices[i];
		if (s->async)
			do_cancel(dev, s);
	}
}

/*
 * Return 1 if any subdevice of @dev is busy or has an mmapped async buffer,
 * otherwise 0.  An unattached device is never busy.  Caller holds
 * dev->mutex.
 */
static int is_device_busy(struct comedi_device *dev)
{
	struct comedi_subdevice *s;
	int i;

	lockdep_assert_held(&dev->mutex);
	if (!dev->attached)
		return 0;

	for (i = 0; i < dev->n_subdevices; i++) {
		s = &dev->subdevices[i];
		if (s->busy)
			return 1;
		if (s->async && comedi_buf_is_mmapped(s))
			return 1;
	}

	return 0;
}

/*
 * COMEDI_DEVCONFIG ioctl
 * attaches (and configures) or detaches a legacy device
 *
 * arg:
 *	pointer to comedi_devconfig structure (NULL if detaching)
 *
 * reads:
 *	comedi_devconfig structure (if attaching)
 *
 * writes:
 *	nothing
 */
static int do_devconfig_ioctl(struct comedi_device *dev,
			      struct comedi_devconfig __user *arg)
{
	struct comedi_devconfig it;

	lockdep_assert_held(&dev->mutex);
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!arg) {
		/* Detach request: refused while any subdevice is in use. */
		if (is_device_busy(dev))
			return -EBUSY;
		if (dev->attached) {
			/* Read the module pointer before detaching. */
			struct module *driver_module = dev->driver->module;

			comedi_device_detach(dev);
			module_put(driver_module);
		}
		return 0;
	}

	if (copy_from_user(&it, arg, sizeof(it)))
		return -EFAULT;
it.board_name[COMEDI_NAMELEN - 1] = 0;	/* force NUL termination */

	if (it.options[COMEDI_DEVCONF_AUX_DATA_LENGTH]) {
		dev_warn(dev->class_dev,
			 "comedi_config --init_data is deprecated\n");
		return -EINVAL;
	}

	if (dev->minor >= comedi_num_legacy_minors)
		/* don't re-use dynamically allocated comedi devices */
		return -EBUSY;

	/* This increments the driver module count on success. */
	return comedi_device_attach(dev, &it);
}

/*
 * COMEDI_BUFCONFIG ioctl
 * buffer configuration
 *
 * arg:
 *	pointer to comedi_bufconfig structure
 *
 * reads:
 *	comedi_bufconfig structure
 *
 * writes:
 *	modified comedi_bufconfig structure
 *
 * Caller holds dev->mutex.  A non-zero maximum_size requires CAP_SYS_ADMIN
 * and is stored before any resize is attempted, so it stays in effect even
 * if the following resize fails.  For a subdevice without async support both
 * sizes are reported back as 0.
 */
static int do_bufconfig_ioctl(struct comedi_device *dev,
			      struct comedi_bufconfig __user *arg)
{
	struct comedi_bufconfig bc;
	struct comedi_async *async;
	struct comedi_subdevice *s;
	int retval = 0;

	lockdep_assert_held(&dev->mutex);
	if (copy_from_user(&bc, arg, sizeof(bc)))
		return -EFAULT;

	if (bc.subdevice >= dev->n_subdevices)
		return -EINVAL;

	s = &dev->subdevices[bc.subdevice];
	async = s->async;

	if (!async) {
		dev_dbg(dev->class_dev,
			"subdevice does not have async capability\n");
		bc.size = 0;
		bc.maximum_size = 0;
		goto copyback;
	}

	if (bc.maximum_size) {
		if (!capable(CAP_SYS_ADMIN))
			return -EPERM;

		async->max_bufsize = bc.maximum_size;
	}

	if (bc.size) {
		retval = resize_async_buffer(dev, s, bc.size);
		if (retval < 0)
			return retval;
	}

	/* Report the sizes actually in effect. */
	bc.size = async->prealloc_bufsz;
	bc.maximum_size = async->max_bufsize;

copyback:
	if (copy_to_user(arg, &bc, sizeof(bc)))
		return -EFAULT;

	return 0;
}

/*
 * COMEDI_DEVINFO ioctl
 * device info
 *
 * arg:
 *	pointer to comedi_devinfo structure
 *
 * reads:
 *	nothing
 *
 * writes:
 *	comedi_devinfo structure
 */
static int do_devinfo_ioctl(struct comedi_device *dev,
			    struct comedi_devinfo __user *arg,
			    struct file *file)
{
	struct comedi_subdevice *s;
	struct comedi_devinfo devinfo;

	lockdep_assert_held(&dev->mutex);
	/* Zero the whole structure before filling it in. */
	memset(&devinfo, 0, sizeof(devinfo));

	/* fill devinfo structure */
	devinfo.version_code = COMEDI_VERSION_CODE;
	devinfo.n_subdevs = dev->n_subdevices;
strscpy(devinfo.driver_name, dev->driver->driver_name, COMEDI_NAMELEN);
	strscpy(devinfo.board_name, dev->board_name, COMEDI_NAMELEN);

	/* Report the file's current read/write subdevice index, or -1. */
	s = comedi_file_read_subdevice(file);
	if (s)
		devinfo.read_subdevice = s->index;
	else
		devinfo.read_subdevice = -1;

	s = comedi_file_write_subdevice(file);
	if (s)
		devinfo.write_subdevice = s->index;
	else
		devinfo.write_subdevice = -1;

	if (copy_to_user(arg, &devinfo, sizeof(devinfo)))
		return -EFAULT;

	return 0;
}

/*
 * COMEDI_SUBDINFO ioctl
 * subdevices info
 *
 * arg:
 *	pointer to array of comedi_subdinfo structures
 *
 * reads:
 *	nothing
 *
 * writes:
 *	array of comedi_subdinfo structures
 *
 * Caller holds dev->mutex.  One entry is written per subdevice of @dev.
 * @file is only compared against s->busy and s->lock to set the *_OWNER
 * flags.  Returns 0, -ENOMEM or -EFAULT.
 */
static int do_subdinfo_ioctl(struct comedi_device *dev,
			     struct comedi_subdinfo __user *arg, void *file)
{
	int ret, i;
	struct comedi_subdinfo *tmp, *us;
	struct comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	/* Zero-initialised temporary array, copied out in one go below. */
	tmp = kcalloc(dev->n_subdevices, sizeof(*tmp), GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	/* fill subdinfo structs */
	for (i = 0; i < dev->n_subdevices; i++) {
		s = &dev->subdevices[i];
		us = tmp + i;

		us->type = s->type;
		us->n_chan = s->n_chan;
		us->subd_flags = s->subdev_flags;
		if (comedi_is_subdevice_running(s))
			us->subd_flags |= SDF_RUNNING;
#define TIMER_nanosec 5		/* backwards compatibility */
		us->timer_type = TIMER_nanosec;
		us->len_chanlist = s->len_chanlist;
		us->maxdata = s->maxdata;
		if (s->range_table) {
			/* Subdevice index in bits 24+, table length in low bits. */
			us->range_type =
			    (i << 24) | (0 << 16) | (s->range_table->length);
		} else {
			us->range_type = 0;	/* XXX */
		}

		if (s->busy)
			us->subd_flags |= SDF_BUSY;
		if (s->busy == file)
			us->subd_flags |= SDF_BUSY_OWNER;
		if (s->lock)
			us->subd_flags |= SDF_LOCKED;
		if (s->lock == file)
			us->subd_flags |= SDF_LOCK_OWNER;
		if (!s->maxdata && s->maxdata_list)
			us->subd_flags |= SDF_MAXDATA;
		if (s->range_table_list)
			us->subd_flags |= SDF_RANGETYPE;
		if (s->do_cmd)
			us->subd_flags |= SDF_CMD;

		if (s->insn_bits != &insn_inval)
			us->insn_bits_support = COMEDI_SUPPORTED;
		else
			us->insn_bits_support = COMEDI_UNSUPPORTED;
	}

	ret = copy_to_user(arg, tmp, dev->n_subdevices *
sizeof(*tmp)); kfree(tmp); return ret ? -EFAULT : 0; } /* * COMEDI_CHANINFO ioctl * subdevice channel info * * arg: * pointer to comedi_chaninfo structure * * reads: * comedi_chaninfo structure * * writes: * array of maxdata values to chaninfo->maxdata_list if requested * array of range table lengths to chaninfo->range_table_list if requested */ static int do_chaninfo_ioctl(struct comedi_device *dev, struct comedi_chaninfo *it) { struct comedi_subdevice *s; lockdep_assert_held(&dev->mutex); if (it->subdev >= dev->n_subdevices) return -EINVAL; s = &dev->subdevices[it->subdev]; if (it->maxdata_list) { if (s->maxdata || !s->maxdata_list) return -EINVAL; if (copy_to_user(it->maxdata_list, s->maxdata_list, s->n_chan * sizeof(unsigned int))) return -EFAULT; } if (it->flaglist) return -EINVAL; /* flaglist not supported */ if (it->rangelist) { int i; if (!s->range_table_list) return -EINVAL; for (i = 0; i < s->n_chan; i++) { int x; x = (dev->minor << 28) | (it->subdev << 24) | (i << 16) | (s->range_table_list[i]->length); if (put_user(x, it->rangelist + i)) return -EFAULT; } } return 0; } /* * COMEDI_BUFINFO ioctl * buffer information * * arg: * pointer to comedi_bufinfo structure * * reads: * comedi_bufinfo structure * * writes: * modified comedi_bufinfo structure */ static int do_bufinfo_ioctl(struct comedi_device *dev, struct comedi_bufinfo __user *arg, void *file) { struct comedi_bufinfo bi; struct comedi_subdevice *s; struct comedi_async *async; unsigned int runflags; int retval = 0; bool become_nonbusy = false; lockdep_assert_held(&dev->mutex); if (copy_from_user(&bi, arg, sizeof(bi))) return -EFAULT; if (bi.subdevice >= dev->n_subdevices) return -EINVAL; s = &dev->subdevices[bi.subdevice]; async = s->async; if (!async || s->busy != file) return -EINVAL; runflags = comedi_get_subdevice_runflags(s); if (!(async->cmd.flags & CMDF_WRITE)) { /* command was set up in "read" direction */ if (bi.bytes_read) { comedi_buf_read_alloc(s, bi.bytes_read); bi.bytes_read = 
comedi_buf_read_free(s, bi.bytes_read);
		}
		/*
		 * If nothing left to read, and command has stopped, and
		 * {"read" position not updated or command stopped normally},
		 * then become non-busy.
		 */
		if (comedi_buf_read_n_available(s) == 0 &&
		    !comedi_is_runflags_running(runflags) &&
		    (bi.bytes_read == 0 ||
		     !comedi_is_runflags_in_error(runflags))) {
			become_nonbusy = true;
			if (comedi_is_runflags_in_error(runflags))
				retval = -EPIPE;
		}
		bi.bytes_written = 0;
	} else {
		/* command was set up in "write" direction */
		if (!comedi_is_runflags_running(runflags)) {
			bi.bytes_written = 0;
			become_nonbusy = true;
			if (comedi_is_runflags_in_error(runflags))
				retval = -EPIPE;
		} else if (bi.bytes_written) {
			comedi_buf_write_alloc(s, bi.bytes_written);
			bi.bytes_written =
			    comedi_buf_write_free(s, bi.bytes_written);
		}
		bi.bytes_read = 0;
	}

	/* snapshot the buffer positions before possibly going non-busy */
	bi.buf_write_count = async->buf_write_count;
	bi.buf_write_ptr = async->buf_write_ptr;
	bi.buf_read_count = async->buf_read_count;
	bi.buf_read_ptr = async->buf_read_ptr;

	if (become_nonbusy)
		do_become_nonbusy(dev, s);

	/* on error (-EPIPE) nothing is copied back to the caller */
	if (retval)
		return retval;

	if (copy_to_user(arg, &bi, sizeof(bi)))
		return -EFAULT;

	return 0;
}

/*
 * Check that insn->n matches the data length expected for the
 * INSN_CONFIG id held in data[0].
 *
 * Returns 0 if the length is acceptable (including unknown ids, which
 * are allowed with a warning) and -EINVAL otherwise.
 */
static int check_insn_config_length(struct comedi_insn *insn,
				    unsigned int *data)
{
	/* data[0] (the config id) must exist */
	if (insn->n < 1)
		return -EINVAL;

	switch (data[0]) {
	case INSN_CONFIG_DIO_OUTPUT:
	case INSN_CONFIG_DIO_INPUT:
	case INSN_CONFIG_DISARM:
	case INSN_CONFIG_RESET:
		if (insn->n == 1)
			return 0;
		break;
	case INSN_CONFIG_ARM:
	case INSN_CONFIG_DIO_QUERY:
	case INSN_CONFIG_BLOCK_SIZE:
	case INSN_CONFIG_FILTER:
	case INSN_CONFIG_SERIAL_CLOCK:
	case INSN_CONFIG_BIDIRECTIONAL_DATA:
	case INSN_CONFIG_ALT_SOURCE:
	case INSN_CONFIG_SET_COUNTER_MODE:
	case INSN_CONFIG_8254_READ_STATUS:
	case INSN_CONFIG_SET_ROUTING:
	case INSN_CONFIG_GET_ROUTING:
	case INSN_CONFIG_GET_PWM_STATUS:
	case INSN_CONFIG_PWM_SET_PERIOD:
	case INSN_CONFIG_PWM_GET_PERIOD:
		if (insn->n == 2)
			return 0;
		break;
	case INSN_CONFIG_SET_GATE_SRC:
	case INSN_CONFIG_GET_GATE_SRC:
	case INSN_CONFIG_SET_CLOCK_SRC:
	case INSN_CONFIG_GET_CLOCK_SRC:
	case INSN_CONFIG_SET_OTHER_SRC:
	case INSN_CONFIG_GET_COUNTER_STATUS:
	case INSN_CONFIG_GET_PWM_OUTPUT:
	case INSN_CONFIG_PWM_SET_H_BRIDGE:
	case INSN_CONFIG_PWM_GET_H_BRIDGE:
	case INSN_CONFIG_GET_HARDWARE_BUFFER_SIZE:
		if (insn->n == 3)
			return 0;
		break;
	case INSN_CONFIG_PWM_OUTPUT:
	case INSN_CONFIG_ANALOG_TRIG:
	case INSN_CONFIG_TIMER_1:
		if (insn->n == 5)
			return 0;
		break;
	case INSN_CONFIG_DIGITAL_TRIG:
		if (insn->n == 6)
			return 0;
		break;
	case INSN_CONFIG_GET_CMD_TIMING_CONSTRAINTS:
		if (insn->n >= 4)
			return 0;
		break;
		/*
		 * by default we allow the insn since we don't have checks for
		 * all possible cases yet
		 */
	default:
		pr_warn("No check for data length of config insn id %i is implemented\n",
			data[0]);
		pr_warn("Add a check to %s in %s\n", __func__, __FILE__);
		pr_warn("Assuming n=%i is correct\n", insn->n);
		return 0;
	}
	return -EINVAL;
}

/*
 * Check that insn->n matches the data length expected for the
 * INSN_DEVICE_CONFIG id held in data[0].  Unlike
 * check_insn_config_length(), unknown ids are rejected.
 *
 * Returns 0 if the length is acceptable and -EINVAL otherwise.
 */
static int check_insn_device_config_length(struct comedi_insn *insn,
					   unsigned int *data)
{
	if (insn->n < 1)
		return -EINVAL;

	switch (data[0]) {
	case INSN_DEVICE_CONFIG_TEST_ROUTE:
	case INSN_DEVICE_CONFIG_CONNECT_ROUTE:
	case INSN_DEVICE_CONFIG_DISCONNECT_ROUTE:
		if (insn->n == 3)
			return 0;
		break;
	case INSN_DEVICE_CONFIG_GET_ROUTES:
		/*
		 * Big enough for config_id and the length of the userland
		 * memory buffer.  Additional length should be in factors of 2
		 * to communicate any returned route pairs (source,destination).
		 */
		if (insn->n >= 2)
			return 0;
		break;
	}
	return -EINVAL;
}

/**
 * get_valid_routes() - Calls low-level driver get_valid_routes function to
 *			either return a count of valid routes to user, or copy
 *			of list of all valid device routes to buffer in
 *			userspace.
 * @dev: comedi device pointer
 * @data: data from user insn call.  The length of the data must be >= 2.
 *	  data[0] must contain the INSN_DEVICE_CONFIG config_id.
 *	  data[1](input) contains the number of _pairs_ for which memory is
 *		  allotted from the user.  If the user specifies '0', then only
 *		  the number of pairs available is returned.
 *	  data[1](output) returns either the number of pairs available (if none
 *		  were requested) or the number of _pairs_ that are copied back
 *		  to the user.
 *	  data[2::2] returns each (source, destination) pair.
 *
 * Return: Always 0.  The value returned by the low-level driver's
 *	   get_valid_routes handler is stored in data[1] rather than being
 *	   returned.
 */
static int get_valid_routes(struct comedi_device *dev, unsigned int *data)
{
	lockdep_assert_held(&dev->mutex);
	data[1] = dev->get_valid_routes(dev, data[1], data + 2);
	return 0;
}

/*
 * Validate and execute a single instruction.
 *
 * @data is the kernel scratch buffer holding (for write-type
 * instructions) or receiving (for read-type instructions) the
 * instruction's samples.  Must be called with dev->mutex held.
 *
 * Returns a negative errno on failure; otherwise the value produced by
 * the selected handler (the special instructions return the number of
 * data elements they handled).
 */
static int parse_insn(struct comedi_device *dev, struct comedi_insn *insn,
		      unsigned int *data, void *file)
{
	struct comedi_subdevice *s;
	int ret = 0;
	int i;

	lockdep_assert_held(&dev->mutex);
	if (insn->insn & INSN_MASK_SPECIAL) {
		/* a non-subdevice instruction */

		switch (insn->insn) {
		case INSN_GTOD:
			{
				struct timespec64 tv;

				if (insn->n != 2) {
					ret = -EINVAL;
					break;
				}

				ktime_get_real_ts64(&tv);
				/* unsigned data safe until 2106 */
				data[0] = (unsigned int)tv.tv_sec;
				/* second word is microseconds */
				data[1] = tv.tv_nsec / NSEC_PER_USEC;
				ret = 2;

				break;
			}
		case INSN_WAIT:
			/* data[0] is in nanoseconds; it must be below 100000 */
			if (insn->n != 1 || data[0] >= 100000) {
				ret = -EINVAL;
				break;
			}
			udelay(data[0] / 1000);
			ret = 1;
			break;
		case INSN_INTTRIG:
			if (insn->n != 1) {
				ret = -EINVAL;
				break;
			}
			if (insn->subdev >= dev->n_subdevices) {
				dev_dbg(dev->class_dev,
					"%d not usable subdevice\n",
					insn->subdev);
				ret = -EINVAL;
				break;
			}
			s = &dev->subdevices[insn->subdev];
			if (!s->async) {
				dev_dbg(dev->class_dev, "no async\n");
				ret = -EINVAL;
				break;
			}
			if (!s->async->inttrig) {
				dev_dbg(dev->class_dev, "no inttrig\n");
				ret = -EAGAIN;
				break;
			}
			ret = s->async->inttrig(dev, s, data[0]);
			/* any non-negative result is reported as 1 */
			if (ret >= 0)
				ret = 1;
			break;
		case INSN_DEVICE_CONFIG:
			ret = check_insn_device_config_length(insn, data);
			if (ret)
				break;

			if (data[0] == INSN_DEVICE_CONFIG_GET_ROUTES) {
				/*
				 * data[1] should be the number of _pairs_ that
				 * the memory can hold.
				 */
				data[1] = (insn->n - 2) / 2;
				ret = get_valid_routes(dev, data);
				break;
			}

			/* other global device config instructions. */
			ret = dev->insn_device_config(dev, insn, data);
			break;
		default:
			dev_dbg(dev->class_dev, "invalid insn\n");
			ret = -EINVAL;
			break;
		}
	} else {
		/* a subdevice instruction */
		unsigned int maxdata;

		if (insn->subdev >= dev->n_subdevices) {
			dev_dbg(dev->class_dev, "subdevice %d out of range\n",
				insn->subdev);
			ret = -EINVAL;
			goto out;
		}
		s = &dev->subdevices[insn->subdev];

		if (s->type == COMEDI_SUBD_UNUSED) {
			dev_dbg(dev->class_dev, "%d not usable subdevice\n",
				insn->subdev);
			ret = -EIO;
			goto out;
		}

		/* are we locked? (ioctl lock) */
		if (s->lock && s->lock != file) {
			dev_dbg(dev->class_dev, "device locked\n");
			ret = -EACCES;
			goto out;
		}

		ret = comedi_check_chanlist(s, 1, &insn->chanspec);
		if (ret < 0) {
			ret = -EINVAL;
			dev_dbg(dev->class_dev, "bad chanspec\n");
			goto out;
		}

		if (s->busy) {
			ret = -EBUSY;
			goto out;
		}
		/* This looks arbitrary.  It is. */
		s->busy = parse_insn;
		switch (insn->insn) {
		case INSN_READ:
			ret = s->insn_read(dev, s, insn, data);
			if (ret == -ETIMEDOUT) {
				dev_dbg(dev->class_dev,
					"subdevice %d read instruction timed out\n",
					s->index);
			}
			break;
		case INSN_WRITE:
			/* per-channel maxdata takes precedence if present */
			maxdata = s->maxdata_list
			    ? s->maxdata_list[CR_CHAN(insn->chanspec)]
			    : s->maxdata;
			for (i = 0; i < insn->n; ++i) {
				if (data[i] > maxdata) {
					ret = -EINVAL;
					dev_dbg(dev->class_dev,
						"bad data value(s)\n");
					break;
				}
			}
			if (ret == 0) {
				ret = s->insn_write(dev, s, insn, data);
				if (ret == -ETIMEDOUT) {
					dev_dbg(dev->class_dev,
						"subdevice %d write instruction timed out\n",
						s->index);
				}
			}
			break;
		case INSN_BITS:
			if (insn->n != 2) {
				ret = -EINVAL;
			} else {
				/*
				 * Most drivers ignore the base channel in
				 * insn->chanspec.  Fix this here if
				 * the subdevice has <= 32 channels.
				 */
				unsigned int orig_mask = data[0];
				unsigned int shift = 0;

				if (s->n_chan <= 32) {
					shift = CR_CHAN(insn->chanspec);
					if (shift > 0) {
						insn->chanspec = 0;
						data[0] <<= shift;
						data[1] <<= shift;
					}
				}
				ret = s->insn_bits(dev, s, insn, data);
				/* hand back the caller's mask and unshift bits */
				data[0] = orig_mask;
				if (shift > 0)
					data[1] >>= shift;
			}
			break;
		case INSN_CONFIG:
			ret = check_insn_config_length(insn, data);
			if (ret)
				break;
			ret = s->insn_config(dev, s, insn, data);
			break;
		default:
			ret = -EINVAL;
			break;
		}

		s->busy = NULL;
	}

out:
	return ret;
}

/*
 * COMEDI_INSNLIST ioctl
 * synchronous instruction list
 *
 * arg:
 *	pointer to comedi_insnlist structure
 *
 * reads:
 *	comedi_insnlist structure
 *	array of comedi_insn structures from insnlist->insns pointer
 *	data (for writes) from insns[].data pointers
 *
 * writes:
 *	data (for reads) to insns[].data pointers
 */
/* arbitrary limits */
#define MIN_SAMPLES 16
#define MAX_SAMPLES 65536
static int do_insnlist_ioctl(struct comedi_device *dev,
			     struct comedi_insn *insns,
			     unsigned int n_insns,
			     void *file)
{
	unsigned int *data = NULL;
	unsigned int max_n_data_required = MIN_SAMPLES;
	int i = 0;
	int ret = 0;

	lockdep_assert_held(&dev->mutex);

	/* Determine maximum memory needed for all instructions. */
	for (i = 0; i < n_insns; ++i) {
		if (insns[i].n > MAX_SAMPLES) {
			dev_dbg(dev->class_dev,
				"number of samples too large\n");
			ret = -EINVAL;
			goto error;
		}
		max_n_data_required = max(max_n_data_required, insns[i].n);
	}

	/* Allocate scratch space for all instruction data.
*/ data = kmalloc_array(max_n_data_required, sizeof(unsigned int), GFP_KERNEL); if (!data) { ret = -ENOMEM; goto error; } for (i = 0; i < n_insns; ++i) { if (insns[i].insn & INSN_MASK_WRITE) { if (copy_from_user(data, insns[i].data, insns[i].n * sizeof(unsigned int))) { dev_dbg(dev->class_dev, "copy_from_user failed\n"); ret = -EFAULT; goto error; } } ret = parse_insn(dev, insns + i, data, file); if (ret < 0) goto error; if (insns[i].insn & INSN_MASK_READ) { if (copy_to_user(insns[i].data, data, insns[i].n * sizeof(unsigned int))) { dev_dbg(dev->class_dev, "copy_to_user failed\n"); ret = -EFAULT; goto error; } } if (need_resched()) schedule(); } error: kfree(data); if (ret < 0) return ret; return i; } /* * COMEDI_INSN ioctl * synchronous instruction * * arg: * pointer to comedi_insn structure * * reads: * comedi_insn structure * data (for writes) from insn->data pointer * * writes: * data (for reads) to insn->data pointer */ static int do_insn_ioctl(struct comedi_device *dev, struct comedi_insn *insn, void *file) { unsigned int *data = NULL; unsigned int n_data = MIN_SAMPLES; int ret = 0; lockdep_assert_held(&dev->mutex); n_data = max(n_data, insn->n); /* This is where the behavior of insn and insnlist deviate. 
*/ if (insn->n > MAX_SAMPLES) { insn->n = MAX_SAMPLES; n_data = MAX_SAMPLES; } data = kmalloc_array(n_data, sizeof(unsigned int), GFP_KERNEL); if (!data) { ret = -ENOMEM; goto error; } if (insn->insn & INSN_MASK_WRITE) { if (copy_from_user(data, insn->data, insn->n * sizeof(unsigned int))) { ret = -EFAULT; goto error; } } ret = parse_insn(dev, insn, data, file); if (ret < 0) goto error; if (insn->insn & INSN_MASK_READ) { if (copy_to_user(insn->data, data, insn->n * sizeof(unsigned int))) { ret = -EFAULT; goto error; } } ret = insn->n; error: kfree(data); return ret; } static int __comedi_get_user_cmd(struct comedi_device *dev, struct comedi_cmd *cmd) { struct comedi_subdevice *s; lockdep_assert_held(&dev->mutex); if (cmd->subdev >= dev->n_subdevices) { dev_dbg(dev->class_dev, "%d no such subdevice\n", cmd->subdev); return -ENODEV; } s = &dev->subdevices[cmd->subdev]; if (s->type == COMEDI_SUBD_UNUSED) { dev_dbg(dev->class_dev, "%d not valid subdevice\n", cmd->subdev); return -EIO; } if (!s->do_cmd || !s->do_cmdtest || !s->async) { dev_dbg(dev->class_dev, "subdevice %d does not support commands\n", cmd->subdev); return -EIO; } /* make sure channel/gain list isn't too long */ if (cmd->chanlist_len > s->len_chanlist) { dev_dbg(dev->class_dev, "channel/gain list too long %d > %d\n", cmd->chanlist_len, s->len_chanlist); return -EINVAL; } /* * Set the CMDF_WRITE flag to the correct state if the subdevice * supports only "read" commands or only "write" commands. 
*/
	switch (s->subdev_flags & (SDF_CMD_READ | SDF_CMD_WRITE)) {
	case SDF_CMD_READ:
		cmd->flags &= ~CMDF_WRITE;
		break;
	case SDF_CMD_WRITE:
		cmd->flags |= CMDF_WRITE;
		break;
	default:
		/* both or neither direction: leave the caller's flag alone */
		break;
	}

	return 0;
}

/*
 * Copy the user's channel/gain list into a kernel buffer and validate it.
 *
 * On success cmd->chanlist points at the kernel copy, which the caller
 * must eventually free.  On failure cmd->chanlist is NULL and a negative
 * errno is returned.
 */
static int __comedi_get_user_chanlist(struct comedi_device *dev,
				      struct comedi_subdevice *s,
				      unsigned int __user *user_chanlist,
				      struct comedi_cmd *cmd)
{
	unsigned int *chanlist;
	int ret;

	lockdep_assert_held(&dev->mutex);
	cmd->chanlist = NULL;
	chanlist = memdup_array_user(user_chanlist,
				     cmd->chanlist_len, sizeof(unsigned int));
	if (IS_ERR(chanlist))
		return PTR_ERR(chanlist);

	/* make sure each element in channel/gain list is valid */
	ret = comedi_check_chanlist(s, cmd->chanlist_len, chanlist);
	if (ret < 0) {
		kfree(chanlist);
		return ret;
	}

	cmd->chanlist = chanlist;

	return 0;
}

/*
 * COMEDI_CMD ioctl
 * asynchronous acquisition command set-up
 *
 * arg:
 *	pointer to comedi_cmd structure
 *
 * reads:
 *	comedi_cmd structure
 *	channel/range list from cmd->chanlist pointer
 *
 * writes:
 *	possibly modified comedi_cmd structure (when -EAGAIN returned)
 *
 * @cmd is a kernel copy of the user's structure; *@copy is set to true
 * when the caller should copy @cmd back to user space.
 */
static int do_cmd_ioctl(struct comedi_device *dev,
			struct comedi_cmd *cmd, bool *copy, void *file)
{
	struct comedi_subdevice *s;
	struct comedi_async *async;
	unsigned int __user *user_chanlist;
	int ret;

	lockdep_assert_held(&dev->mutex);

	/* do some simple cmd validation */
	ret = __comedi_get_user_cmd(dev, cmd);
	if (ret)
		return ret;

	/* save user's chanlist pointer so it can be restored later */
	user_chanlist = (unsigned int __user *)cmd->chanlist;

	s = &dev->subdevices[cmd->subdev];
	async = s->async;

	/* are we locked? (ioctl lock) */
	if (s->lock && s->lock != file) {
		dev_dbg(dev->class_dev, "subdevice locked\n");
		return -EACCES;
	}

	/* are we busy? */
	if (s->busy) {
		dev_dbg(dev->class_dev, "subdevice busy\n");
		return -EBUSY;
	}

	/* make sure channel/gain list isn't too short */
	if (cmd->chanlist_len < 1) {
		dev_dbg(dev->class_dev, "channel/gain list too short %u < 1\n",
			cmd->chanlist_len);
		return -EINVAL;
	}

	async->cmd = *cmd;
	async->cmd.data = NULL;

	/* load channel/gain list */
	ret = __comedi_get_user_chanlist(dev, s, user_chanlist, &async->cmd);
	if (ret)
		goto cleanup;

	ret = s->do_cmdtest(dev, s, &async->cmd);

	/* a failed or "bogus" test hands the adjusted command back */
	if (async->cmd.flags & CMDF_BOGUS || ret) {
		dev_dbg(dev->class_dev, "test returned %d\n", ret);
		*cmd = async->cmd;
		/* restore chanlist pointer before copying back */
		cmd->chanlist = (unsigned int __force *)user_chanlist;
		cmd->data = NULL;
		*copy = true;
		ret = -EAGAIN;
		goto cleanup;
	}

	if (!async->prealloc_bufsz) {
		ret = -ENOMEM;
		dev_dbg(dev->class_dev, "no buffer (?)\n");
		goto cleanup;
	}

	comedi_buf_reset(s);

	async->cb_mask = COMEDI_CB_BLOCK | COMEDI_CB_CANCEL_MASK;
	if (async->cmd.flags & CMDF_WAKE_EOS)
		async->cb_mask |= COMEDI_CB_EOS;

	comedi_update_subdevice_runflags(s, COMEDI_SRF_BUSY_MASK,
					 COMEDI_SRF_RUNNING);

	/*
	 * Set s->busy _after_ setting COMEDI_SRF_RUNNING flag to avoid
	 * race with comedi_read() or comedi_write().
	 */
	s->busy = file;
	ret = s->do_cmd(dev, s);
	if (ret == 0)
		return 0;

cleanup:
	do_become_nonbusy(dev, s);

	return ret;
}

/*
 * COMEDI_CMDTEST ioctl
 * asynchronous acquisition command testing
 *
 * arg:
 *	pointer to comedi_cmd structure
 *
 * reads:
 *	comedi_cmd structure
 *	channel/range list from cmd->chanlist pointer
 *
 * writes:
 *	possibly modified comedi_cmd structure
 *
 * @cmd is a kernel copy of the user's structure; *@copy is set to true
 * once the driver's do_cmdtest handler has run.
 */
static int do_cmdtest_ioctl(struct comedi_device *dev,
			    struct comedi_cmd *cmd, bool *copy, void *file)
{
	struct comedi_subdevice *s;
	unsigned int __user *user_chanlist;
	int ret;

	lockdep_assert_held(&dev->mutex);

	/* do some simple cmd validation */
	ret = __comedi_get_user_cmd(dev, cmd);
	if (ret)
		return ret;

	/* save user's chanlist pointer so it can be restored later */
	user_chanlist = (unsigned int __user *)cmd->chanlist;

	s = &dev->subdevices[cmd->subdev];

	/* user_chanlist can be NULL for COMEDI_CMDTEST ioctl */
	if (user_chanlist) {
		/* load channel/gain list */
		ret = __comedi_get_user_chanlist(dev, s, user_chanlist, cmd);
		if (ret)
			return ret;
	}

	ret = s->do_cmdtest(dev, s, cmd);

	kfree(cmd->chanlist);	/* free kernel copy of user chanlist */

	/* restore chanlist pointer before copying back */
	cmd->chanlist = (unsigned int __force *)user_chanlist;
	*copy = true;

	return ret;
}

/*
 * COMEDI_LOCK ioctl
 * lock subdevice
 *
 * arg:
 *	subdevice number
 *
 * reads:
 *	nothing
 *
 * writes:
 *	nothing
 *
 * Returns 0 on success, -EINVAL for a bad subdevice number, or -EBUSY if
 * the subdevice is already busy or locked.
 */
static int do_lock_ioctl(struct comedi_device *dev, unsigned long arg,
			 void *file)
{
	int ret = 0;
	unsigned long flags;
	struct comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	if (arg >= dev->n_subdevices)
		return -EINVAL;
	s = &dev->subdevices[arg];

	/* test-and-set of s->lock is done under the subdevice spinlock */
	spin_lock_irqsave(&s->spin_lock, flags);
	if (s->busy || s->lock)
		ret = -EBUSY;
	else
		s->lock = file;
	spin_unlock_irqrestore(&s->spin_lock, flags);

	return ret;
}

/*
 * COMEDI_UNLOCK ioctl
 * unlock subdevice
 *
 * arg:
 *	subdevice number
 *
 * reads:
 *	nothing
 *
 * writes:
 *	nothing
 */
static int do_unlock_ioctl(struct comedi_device *dev, unsigned long arg,
			   void *file)
{
	struct
comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	if (arg >= dev->n_subdevices)
		return -EINVAL;
	s = &dev->subdevices[arg];

	if (s->busy)
		return -EBUSY;

	/* locked by a different file */
	if (s->lock && s->lock != file)
		return -EACCES;

	if (s->lock == file)
		s->lock = NULL;

	return 0;
}

/*
 * COMEDI_CANCEL ioctl
 * cancel asynchronous acquisition
 *
 * arg:
 *	subdevice number
 *
 * reads:
 *	nothing
 *
 * writes:
 *	nothing
 *
 * Returns -EINVAL for a bad or non-async subdevice, 0 if the subdevice
 * is not busy, -EBUSY if another file made it busy, otherwise the result
 * of do_cancel().
 */
static int do_cancel_ioctl(struct comedi_device *dev, unsigned long arg,
			   void *file)
{
	struct comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	if (arg >= dev->n_subdevices)
		return -EINVAL;
	s = &dev->subdevices[arg];
	if (!s->async)
		return -EINVAL;

	if (!s->busy)
		return 0;

	if (s->busy != file)
		return -EBUSY;

	return do_cancel(dev, s);
}

/*
 * COMEDI_POLL ioctl
 * instructs driver to synchronize buffers
 *
 * arg:
 *	subdevice number
 *
 * reads:
 *	nothing
 *
 * writes:
 *	nothing
 *
 * Returns -EINVAL for a bad subdevice number or a subdevice without a
 * poll handler, 0 if the subdevice is not busy, -EBUSY if another file
 * made it busy, otherwise the result of the subdevice's poll handler.
 */
static int do_poll_ioctl(struct comedi_device *dev, unsigned long arg,
			 void *file)
{
	struct comedi_subdevice *s;

	lockdep_assert_held(&dev->mutex);
	if (arg >= dev->n_subdevices)
		return -EINVAL;
	s = &dev->subdevices[arg];

	if (!s->busy)
		return 0;

	if (s->busy != file)
		return -EBUSY;

	if (s->poll)
		return s->poll(dev, s);

	return -EINVAL;
}

/*
 * COMEDI_SETRSUBD ioctl
 * sets the current "read" subdevice on a per-file basis
 *
 * arg:
 *	subdevice number
 *
 * reads:
 *	nothing
 *
 * writes:
 *	nothing
 *
 * Returns 0 on success or if nothing changes, -EINVAL if the subdevice
 * number is bad or the subdevice lacks SDF_CMD_READ, and -EBUSY if this
 * file still has a "read" command active on the old subdevice.
 */
static int do_setrsubd_ioctl(struct comedi_device *dev, unsigned long arg,
			     struct file *file)
{
	struct comedi_file *cfp = file->private_data;
	struct comedi_subdevice *s_old, *s_new;

	lockdep_assert_held(&dev->mutex);
	if (arg >= dev->n_subdevices)
		return -EINVAL;

	s_new = &dev->subdevices[arg];
	s_old = comedi_file_read_subdevice(file);
	if (s_old == s_new)
		return 0;	/* no change */

	if (!(s_new->subdev_flags & SDF_CMD_READ))
		return -EINVAL;

	/*
	 * Check the file isn't still busy handling a "read" command on the
	 * old subdevice (if any).
	 */
	if (s_old && s_old->busy == file && s_old->async &&
	    !(s_old->async->cmd.flags & CMDF_WRITE))
		return -EBUSY;

	WRITE_ONCE(cfp->read_subdev, s_new);
	return 0;
}

/*
 * COMEDI_SETWSUBD ioctl
 * sets the current "write" subdevice on a per-file basis
 *
 * arg:
 *	subdevice number
 *
 * reads:
 *	nothing
 *
 * writes:
 *	nothing
 *
 * Returns 0 on success or if nothing changes, -EINVAL if the subdevice
 * number is bad or the subdevice lacks SDF_CMD_WRITE, and -EBUSY if this
 * file still has a "write" command active on the old subdevice.
 */
static int do_setwsubd_ioctl(struct comedi_device *dev, unsigned long arg,
			     struct file *file)
{
	struct comedi_file *cfp = file->private_data;
	struct comedi_subdevice *s_old, *s_new;

	lockdep_assert_held(&dev->mutex);
	if (arg >= dev->n_subdevices)
		return -EINVAL;

	s_new = &dev->subdevices[arg];
	s_old = comedi_file_write_subdevice(file);
	if (s_old == s_new)
		return 0;	/* no change */

	if (!(s_new->subdev_flags & SDF_CMD_WRITE))
		return -EINVAL;

	/*
	 * Check the file isn't still busy handling a "write" command on the
	 * old subdevice (if any).
	 */
	if (s_old && s_old->busy == file && s_old->async &&
	    (s_old->async->cmd.flags & CMDF_WRITE))
		return -EBUSY;

	WRITE_ONCE(cfp->write_subdev, s_new);
	return 0;
}

/*
 * ioctl entry point: takes dev->mutex and dispatches to the do_*_ioctl()
 * helpers.
 */
static long comedi_unlocked_ioctl(struct file *file, unsigned int cmd,
				  unsigned long arg)
{
	unsigned int minor = iminor(file_inode(file));
	struct comedi_file *cfp = file->private_data;
	struct comedi_device *dev = cfp->dev;
	int rc;

	mutex_lock(&dev->mutex);

	/*
	 * Device config is special, because it must work on
	 * an unconfigured device.
	 */
	if (cmd == COMEDI_DEVCONFIG) {
		if (minor >= COMEDI_NUM_BOARD_MINORS) {
			/* Device config not appropriate on non-board minors. */
			rc = -ENOTTY;
			goto done;
		}
		rc = do_devconfig_ioctl(dev,
					(struct comedi_devconfig __user *)arg);
		if (rc == 0) {
			if (arg == 0 &&
			    dev->minor >= comedi_num_legacy_minors) {
				/*
				 * Successfully unconfigured a dynamically
				 * allocated device.  Try and remove it.
*/ if (comedi_clear_board_dev(dev)) { mutex_unlock(&dev->mutex); comedi_free_board_dev(dev); return rc; } } } goto done; } if (!dev->attached) { dev_dbg(dev->class_dev, "no driver attached\n"); rc = -ENODEV; goto done; } switch (cmd) { case COMEDI_BUFCONFIG: rc = do_bufconfig_ioctl(dev, (struct comedi_bufconfig __user *)arg); break; case COMEDI_DEVINFO: rc = do_devinfo_ioctl(dev, (struct comedi_devinfo __user *)arg, file); break; case COMEDI_SUBDINFO: rc = do_subdinfo_ioctl(dev, (struct comedi_subdinfo __user *)arg, file); break; case COMEDI_CHANINFO: { struct comedi_chaninfo it; if (copy_from_user(&it, (void __user *)arg, sizeof(it))) rc = -EFAULT; else rc = do_chaninfo_ioctl(dev, &it); break; } case COMEDI_RANGEINFO: { struct comedi_rangeinfo it; if (copy_from_user(&it, (void __user *)arg, sizeof(it))) rc = -EFAULT; else rc = do_rangeinfo_ioctl(dev, &it); break; } case COMEDI_BUFINFO: rc = do_bufinfo_ioctl(dev, (struct comedi_bufinfo __user *)arg, file); break; case COMEDI_LOCK: rc = do_lock_ioctl(dev, arg, file); break; case COMEDI_UNLOCK: rc = do_unlock_ioctl(dev, arg, file); break; case COMEDI_CANCEL: rc = do_cancel_ioctl(dev, arg, file); break; case COMEDI_CMD: { struct comedi_cmd cmd; bool copy = false; if (copy_from_user(&cmd, (void __user *)arg, sizeof(cmd))) { rc = -EFAULT; break; } rc = do_cmd_ioctl(dev, &cmd, &copy, file); if (copy && copy_to_user((void __user *)arg, &cmd, sizeof(cmd))) rc = -EFAULT; break; } case COMEDI_CMDTEST: { struct comedi_cmd cmd; bool copy = false; if (copy_from_user(&cmd, (void __user *)arg, sizeof(cmd))) { rc = -EFAULT; break; } rc = do_cmdtest_ioctl(dev, &cmd, &copy, file); if (copy && copy_to_user((void __user *)arg, &cmd, sizeof(cmd))) rc = -EFAULT; break; } case COMEDI_INSNLIST: { struct comedi_insnlist insnlist; struct comedi_insn *insns = NULL; if (copy_from_user(&insnlist, (void __user *)arg, sizeof(insnlist))) { rc = -EFAULT; break; } insns = kcalloc(insnlist.n_insns, sizeof(*insns), GFP_KERNEL); if (!insns) { rc = 
-ENOMEM; break; } if (copy_from_user(insns, insnlist.insns, sizeof(*insns) * insnlist.n_insns)) { rc = -EFAULT; kfree(insns); break; } rc = do_insnlist_ioctl(dev, insns, insnlist.n_insns, file); kfree(insns); break; } case COMEDI_INSN: { struct comedi_insn insn; if (copy_from_user(&insn, (void __user *)arg, sizeof(insn))) rc = -EFAULT; else rc = do_insn_ioctl(dev, &insn, file); break; } case COMEDI_POLL: rc = do_poll_ioctl(dev, arg, file); break; case COMEDI_SETRSUBD: rc = do_setrsubd_ioctl(dev, arg, file); break; case COMEDI_SETWSUBD: rc = do_setwsubd_ioctl(dev, arg, file); break; default: rc = -ENOTTY; break; } done: mutex_unlock(&dev->mutex); return rc; } static void comedi_vm_open(struct vm_area_struct *area) { struct comedi_buf_map *bm; bm = area->vm_private_data; comedi_buf_map_get(bm); } static void comedi_vm_close(struct vm_area_struct *area) { struct comedi_buf_map *bm; bm = area->vm_private_data; comedi_buf_map_put(bm); } static int comedi_vm_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct comedi_buf_map *bm = vma->vm_private_data; unsigned long offset = addr - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT); if (len < 0) return -EINVAL; if (len > vma->vm_end - addr) len = vma->vm_end - addr; return comedi_buf_map_access(bm, offset, buf, len, write); } static const struct vm_operations_struct comedi_vm_ops = { .open = comedi_vm_open, .close = comedi_vm_close, .access = comedi_vm_access, }; static int comedi_mmap(struct file *file, struct vm_area_struct *vma) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi_subdevice *s; struct comedi_async *async; struct comedi_buf_map *bm = NULL; struct comedi_buf_page *buf; unsigned long start = vma->vm_start; unsigned long size; int n_pages; int i; int retval = 0; /* * 'trylock' avoids circular dependency with current->mm->mmap_lock * and down-reading &dev->attach_lock should normally succeed without * contention unless the 
device is in the process of being attached
	 * or detached.
	 */
	if (!down_read_trylock(&dev->attach_lock))
		return -EAGAIN;

	if (!dev->attached) {
		dev_dbg(dev->class_dev, "no driver attached\n");
		retval = -ENODEV;
		goto done;
	}

	/* writable mappings use the "write" subdevice, others the "read" one */
	if (vma->vm_flags & VM_WRITE)
		s = comedi_file_write_subdevice(file);
	else
		s = comedi_file_read_subdevice(file);
	if (!s) {
		retval = -EINVAL;
		goto done;
	}

	async = s->async;
	if (!async) {
		retval = -EINVAL;
		goto done;
	}

	if (vma->vm_pgoff != 0) {
		dev_dbg(dev->class_dev, "mmap() offset must be 0.\n");
		retval = -EINVAL;
		goto done;
	}

	/* the mapping must be page-aligned and no larger than the buffer */
	size = vma->vm_end - vma->vm_start;
	if (size > async->prealloc_bufsz) {
		retval = -EFAULT;
		goto done;
	}
	if (offset_in_page(size)) {
		retval = -EFAULT;
		goto done;
	}

	n_pages = vma_pages(vma);

	/* get reference to current buf map (if any) */
	bm = comedi_buf_map_from_subdev_get(s);
	if (!bm || n_pages > bm->n_pages) {
		retval = -EINVAL;
		goto done;
	}
	if (bm->dma_dir != DMA_NONE) {
		/*
		 * DMA buffer was allocated as a single block.
		 * Address is in page_list[0].
		 */
		buf = &bm->page_list[0];
		retval = dma_mmap_coherent(bm->dma_hw_dev, vma, buf->virt_addr,
					   buf->dma_addr, n_pages * PAGE_SIZE);
	} else {
		/* map the buffer one page at a time */
		for (i = 0; i < n_pages; ++i) {
			unsigned long pfn;

			buf = &bm->page_list[i];
			pfn = page_to_pfn(virt_to_page(buf->virt_addr));
			retval = remap_pfn_range(vma, start, pfn, PAGE_SIZE,
						 PAGE_SHARED);
			if (retval)
				break;

			start += PAGE_SIZE;
		}

#ifdef CONFIG_MMU
		/*
		 * Leaving behind a partial mapping of a buffer we're about to
		 * drop is unsafe, see remap_pfn_range_notrack().
		 * We need to zap the range here ourselves instead of relying
		 * on the automatic zapping in remap_pfn_range() because we call
		 * remap_pfn_range() in a loop.
		 */
		if (retval)
			zap_vma_ptes(vma, vma->vm_start, size);
#endif
	}

	if (retval == 0) {
		vma->vm_ops = &comedi_vm_ops;
		vma->vm_private_data = bm;

		/* takes the VMA's own reference on bm (see comedi_vm_open) */
		vma->vm_ops->open(vma);
	}

done:
	up_read(&dev->attach_lock);
	comedi_buf_map_put(bm);	/* put reference to buf map - okay if NULL */
	return retval;
}

/*
 * poll() handler.  Reports readable/writable state for this file's
 * current "read" and "write" subdevices while holding dev->attach_lock
 * for reading.  Returns an empty mask if no driver is attached.
 */
static __poll_t comedi_poll(struct file *file, poll_table *wait)
{
	__poll_t mask = 0;
	struct comedi_file *cfp = file->private_data;
	struct comedi_device *dev = cfp->dev;
	struct comedi_subdevice *s, *s_read;

	down_read(&dev->attach_lock);

	if (!dev->attached) {
		dev_dbg(dev->class_dev, "no driver attached\n");
		goto done;
	}

	s = comedi_file_read_subdevice(file);
	s_read = s;
	if (s && s->async) {
		poll_wait(file, &s->async->wait_head, wait);
		/*
		 * Readable if data is available, or if this file has no
		 * running "read" command on the subdevice.
		 */
		if (s->busy != file || !comedi_is_subdevice_running(s) ||
		    (s->async->cmd.flags & CMDF_WRITE) ||
		    comedi_buf_read_n_available(s) > 0)
			mask |= EPOLLIN | EPOLLRDNORM;
	}

	s = comedi_file_write_subdevice(file);
	if (s && s->async) {
		unsigned int bps = comedi_bytes_per_sample(s);

		/* don't register on the same wait queue twice */
		if (s != s_read)
			poll_wait(file, &s->async->wait_head, wait);
		/*
		 * Writable if there is room for at least one sample, or if
		 * this file has no running "write" command on the subdevice.
		 */
		if (s->busy != file || !comedi_is_subdevice_running(s) ||
		    !(s->async->cmd.flags & CMDF_WRITE) ||
		    comedi_buf_write_n_available(s) >= bps)
			mask |= EPOLLOUT | EPOLLWRNORM;
	}

done:
	up_read(&dev->attach_lock);
	return mask;
}

static ssize_t comedi_write(struct file *file, const char __user *buf,
			    size_t nbytes, loff_t *offset)
{
	struct comedi_subdevice *s;
	struct comedi_async *async;
	unsigned int n, m;
	ssize_t count = 0;
	int retval = 0;
	DECLARE_WAITQUEUE(wait, current);
	struct comedi_file *cfp = file->private_data;
	struct comedi_device *dev = cfp->dev;
	bool become_nonbusy = false;
	bool attach_locked;
	unsigned int old_detach_count;

	/* Protect against device detachment during operation.
*/
	down_read(&dev->attach_lock);
	attach_locked = true;
	/* remembered so a detach/re-attach can be detected later */
	old_detach_count = dev->detach_count;

	if (!dev->attached) {
		dev_dbg(dev->class_dev, "no driver attached\n");
		retval = -ENODEV;
		goto out;
	}

	s = comedi_file_write_subdevice(file);
	if (!s || !s->async) {
		retval = -EIO;
		goto out;
	}

	async = s->async;
	/* this file must own a "write" command on the subdevice */
	if (s->busy != file || !(async->cmd.flags & CMDF_WRITE)) {
		retval = -EINVAL;
		goto out;
	}

	add_wait_queue(&async->wait_head, &wait);
	/* loop until some data has been written or an error occurs */
	while (count == 0 && !retval) {
		unsigned int runflags;
		unsigned int wp, n1, n2;

		set_current_state(TASK_INTERRUPTIBLE);

		runflags = comedi_get_subdevice_runflags(s);
		if (!comedi_is_runflags_running(runflags)) {
			if (comedi_is_runflags_in_error(runflags))
				retval = -EPIPE;
			if (retval || nbytes)
				become_nonbusy = true;
			break;
		}
		if (nbytes == 0)
			break;

		/* Allocate all free buffer space. */
		comedi_buf_write_alloc(s, async->prealloc_bufsz);
		m = comedi_buf_write_n_allocated(s);
		n = min_t(size_t, m, nbytes);

		if (n == 0) {
			/* no space: fail, or sleep until woken and retry */
			if (file->f_flags & O_NONBLOCK) {
				retval = -EAGAIN;
				break;
			}
			schedule();
			if (signal_pending(current)) {
				retval = -ERESTARTSYS;
				break;
			}
			/* re-check ownership after sleeping */
			if (s->busy != file ||
			    !(async->cmd.flags & CMDF_WRITE)) {
				retval = -EINVAL;
				break;
			}
			continue;
		}

		set_current_state(TASK_RUNNING);
		/* copy in up to two pieces: up to buffer end, then from start */
		wp = async->buf_write_ptr;
		n1 = min(n, async->prealloc_bufsz - wp);
		n2 = n - n1;
		m = copy_from_user(async->prealloc_buf + wp, buf, n1);
		if (m)
			m += n2;
		else if (n2)
			m = copy_from_user(async->prealloc_buf, buf + n1, n2);
		/* m is the number of bytes that could not be copied */
		if (m) {
			n -= m;
			retval = -EFAULT;
		}
		comedi_buf_write_free(s, n);

		count += n;
		nbytes -= n;

		buf += n;
	}
	remove_wait_queue(&async->wait_head, &wait);
	set_current_state(TASK_RUNNING);
	if (become_nonbusy && count == 0) {
		struct comedi_subdevice *new_s;

		/*
		 * To avoid deadlock, cannot acquire dev->mutex
		 * while dev->attach_lock is held.
		 */
		up_read(&dev->attach_lock);
		attach_locked = false;
		mutex_lock(&dev->mutex);
		/*
		 * Check device hasn't become detached behind our back.
		 * Checking dev->detach_count is unchanged ought to be
		 * sufficient (unless there have been 2**32 detaches in the
		 * meantime!), but check the subdevice pointer as well just in
		 * case.
		 *
		 * Also check the subdevice is still in a suitable state to
		 * become non-busy in case it changed behind our back.
		 */
		new_s = comedi_file_write_subdevice(file);
		if (dev->attached && old_detach_count == dev->detach_count &&
		    s == new_s && new_s->async == async && s->busy == file &&
		    (async->cmd.flags & CMDF_WRITE) &&
		    !comedi_is_subdevice_running(s))
			do_become_nonbusy(dev, s);
		mutex_unlock(&dev->mutex);
	}
out:
	if (attach_locked)
		up_read(&dev->attach_lock);

	/* a partial transfer takes precedence over the error code */
	return count ? count : retval;
}

static ssize_t comedi_read(struct file *file, char __user *buf, size_t nbytes,
			   loff_t *offset)
{
	struct comedi_subdevice *s;
	struct comedi_async *async;
	unsigned int n, m;
	ssize_t count = 0;
	int retval = 0;
	DECLARE_WAITQUEUE(wait, current);
	struct comedi_file *cfp = file->private_data;
	struct comedi_device *dev = cfp->dev;
	unsigned int old_detach_count;
	bool become_nonbusy = false;
	bool attach_locked;

	/* Protect against device detachment during operation.
*/ down_read(&dev->attach_lock); attach_locked = true; old_detach_count = dev->detach_count; if (!dev->attached) { dev_dbg(dev->class_dev, "no driver attached\n"); retval = -ENODEV; goto out; } s = comedi_file_read_subdevice(file); if (!s || !s->async) { retval = -EIO; goto out; } async = s->async; if (s->busy != file || (async->cmd.flags & CMDF_WRITE)) { retval = -EINVAL; goto out; } add_wait_queue(&async->wait_head, &wait); while (count == 0 && !retval) { unsigned int rp, n1, n2; set_current_state(TASK_INTERRUPTIBLE); m = comedi_buf_read_n_available(s); n = min_t(size_t, m, nbytes); if (n == 0) { unsigned int runflags = comedi_get_subdevice_runflags(s); if (!comedi_is_runflags_running(runflags)) { if (comedi_is_runflags_in_error(runflags)) retval = -EPIPE; if (retval || nbytes) become_nonbusy = true; break; } if (nbytes == 0) break; if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; break; } schedule(); if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (s->busy != file || (async->cmd.flags & CMDF_WRITE)) { retval = -EINVAL; break; } continue; } set_current_state(TASK_RUNNING); rp = async->buf_read_ptr; n1 = min(n, async->prealloc_bufsz - rp); n2 = n - n1; m = copy_to_user(buf, async->prealloc_buf + rp, n1); if (m) m += n2; else if (n2) m = copy_to_user(buf + n1, async->prealloc_buf, n2); if (m) { n -= m; retval = -EFAULT; } comedi_buf_read_alloc(s, n); comedi_buf_read_free(s, n); count += n; nbytes -= n; buf += n; } remove_wait_queue(&async->wait_head, &wait); set_current_state(TASK_RUNNING); if (become_nonbusy && count == 0) { struct comedi_subdevice *new_s; /* * To avoid deadlock, cannot acquire dev->mutex * while dev->attach_lock is held. */ up_read(&dev->attach_lock); attach_locked = false; mutex_lock(&dev->mutex); /* * Check device hasn't become detached behind our back. 
* Checking dev->detach_count is unchanged ought to be * sufficient (unless there have been 2**32 detaches in the * meantime!), but check the subdevice pointer as well just in * case. * * Also check the subdevice is still in a suitable state to * become non-busy in case it changed behind our back. */ new_s = comedi_file_read_subdevice(file); if (dev->attached && old_detach_count == dev->detach_count && s == new_s && new_s->async == async && s->busy == file && !(async->cmd.flags & CMDF_WRITE) && !comedi_is_subdevice_running(s) && comedi_buf_read_n_available(s) == 0) do_become_nonbusy(dev, s); mutex_unlock(&dev->mutex); } out: if (attach_locked) up_read(&dev->attach_lock); return count ? count : retval; } static int comedi_open(struct inode *inode, struct file *file) { const unsigned int minor = iminor(inode); struct comedi_file *cfp; struct comedi_device *dev = comedi_dev_get_from_minor(minor); int rc; if (!dev) { pr_debug("invalid minor number\n"); return -ENODEV; } cfp = kzalloc(sizeof(*cfp), GFP_KERNEL); if (!cfp) { comedi_dev_put(dev); return -ENOMEM; } cfp->dev = dev; mutex_lock(&dev->mutex); if (!dev->attached && !capable(CAP_SYS_ADMIN)) { dev_dbg(dev->class_dev, "not attached and not CAP_SYS_ADMIN\n"); rc = -ENODEV; goto out; } if (dev->attached && dev->use_count == 0) { if (!try_module_get(dev->driver->module)) { rc = -ENXIO; goto out; } if (dev->open) { rc = dev->open(dev); if (rc < 0) { module_put(dev->driver->module); goto out; } } } dev->use_count++; file->private_data = cfp; comedi_file_reset(file); rc = 0; out: mutex_unlock(&dev->mutex); if (rc) { comedi_dev_put(dev); kfree(cfp); } return rc; } static int comedi_fasync(int fd, struct file *file, int on) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; return fasync_helper(fd, file, on, &dev->async_queue); } static int comedi_close(struct inode *inode, struct file *file) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct 
comedi_subdevice *s = NULL; int i; mutex_lock(&dev->mutex); if (dev->subdevices) { for (i = 0; i < dev->n_subdevices; i++) { s = &dev->subdevices[i]; if (s->busy == file) do_cancel(dev, s); if (s->lock == file) s->lock = NULL; } } if (dev->attached && dev->use_count == 1) { if (dev->close) dev->close(dev); module_put(dev->driver->module); } dev->use_count--; mutex_unlock(&dev->mutex); comedi_dev_put(dev); kfree(cfp); return 0; } #ifdef CONFIG_COMPAT #define COMEDI32_CHANINFO _IOR(CIO, 3, struct comedi32_chaninfo_struct) #define COMEDI32_RANGEINFO _IOR(CIO, 8, struct comedi32_rangeinfo_struct) /* * N.B. COMEDI32_CMD and COMEDI_CMD ought to use _IOWR, not _IOR. * It's too late to change it now, but it only affects the command number. */ #define COMEDI32_CMD _IOR(CIO, 9, struct comedi32_cmd_struct) /* * N.B. COMEDI32_CMDTEST and COMEDI_CMDTEST ought to use _IOWR, not _IOR. * It's too late to change it now, but it only affects the command number. */ #define COMEDI32_CMDTEST _IOR(CIO, 10, struct comedi32_cmd_struct) #define COMEDI32_INSNLIST _IOR(CIO, 11, struct comedi32_insnlist_struct) #define COMEDI32_INSN _IOR(CIO, 12, struct comedi32_insn_struct) struct comedi32_chaninfo_struct { unsigned int subdev; compat_uptr_t maxdata_list; /* 32-bit 'unsigned int *' */ compat_uptr_t flaglist; /* 32-bit 'unsigned int *' */ compat_uptr_t rangelist; /* 32-bit 'unsigned int *' */ unsigned int unused[4]; }; struct comedi32_rangeinfo_struct { unsigned int range_type; compat_uptr_t range_ptr; /* 32-bit 'void *' */ }; struct comedi32_cmd_struct { unsigned int subdev; unsigned int flags; unsigned int start_src; unsigned int start_arg; unsigned int scan_begin_src; unsigned int scan_begin_arg; unsigned int convert_src; unsigned int convert_arg; unsigned int scan_end_src; unsigned int scan_end_arg; unsigned int stop_src; unsigned int stop_arg; compat_uptr_t chanlist; /* 32-bit 'unsigned int *' */ unsigned int chanlist_len; compat_uptr_t data; /* 32-bit 'short *' */ unsigned int data_len; 
}; struct comedi32_insn_struct { unsigned int insn; unsigned int n; compat_uptr_t data; /* 32-bit 'unsigned int *' */ unsigned int subdev; unsigned int chanspec; unsigned int unused[3]; }; struct comedi32_insnlist_struct { unsigned int n_insns; compat_uptr_t insns; /* 32-bit 'struct comedi_insn *' */ }; /* Handle 32-bit COMEDI_CHANINFO ioctl. */ static int compat_chaninfo(struct file *file, unsigned long arg) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi32_chaninfo_struct chaninfo32; struct comedi_chaninfo chaninfo; int err; if (copy_from_user(&chaninfo32, compat_ptr(arg), sizeof(chaninfo32))) return -EFAULT; memset(&chaninfo, 0, sizeof(chaninfo)); chaninfo.subdev = chaninfo32.subdev; chaninfo.maxdata_list = compat_ptr(chaninfo32.maxdata_list); chaninfo.flaglist = compat_ptr(chaninfo32.flaglist); chaninfo.rangelist = compat_ptr(chaninfo32.rangelist); mutex_lock(&dev->mutex); err = do_chaninfo_ioctl(dev, &chaninfo); mutex_unlock(&dev->mutex); return err; } /* Handle 32-bit COMEDI_RANGEINFO ioctl. */ static int compat_rangeinfo(struct file *file, unsigned long arg) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi32_rangeinfo_struct rangeinfo32; struct comedi_rangeinfo rangeinfo; int err; if (copy_from_user(&rangeinfo32, compat_ptr(arg), sizeof(rangeinfo32))) return -EFAULT; memset(&rangeinfo, 0, sizeof(rangeinfo)); rangeinfo.range_type = rangeinfo32.range_type; rangeinfo.range_ptr = compat_ptr(rangeinfo32.range_ptr); mutex_lock(&dev->mutex); err = do_rangeinfo_ioctl(dev, &rangeinfo); mutex_unlock(&dev->mutex); return err; } /* Copy 32-bit cmd structure to native cmd structure. 
*/
/*
 * get_compat_cmd() - fetch a 32-bit command from user space
 * @cmd: native command structure to fill in.
 * @cmd32: user-space pointer to the 32-bit command structure.
 *
 * Every member of @cmd is assigned from the 32-bit image; the two pointer
 * members are widened with compat_ptr().  @cmd is left untouched when the
 * user copy fails.
 *
 * Return: 0 on success, -EFAULT if @cmd32 cannot be read.
 */
static int get_compat_cmd(struct comedi_cmd *cmd,
			  struct comedi32_cmd_struct __user *cmd32)
{
	struct comedi32_cmd_struct src;

	if (copy_from_user(&src, cmd32, sizeof(src)) != 0)
		return -EFAULT;

	/* Target subdevice and flags. */
	cmd->subdev = src.subdev;
	cmd->flags = src.flags;

	/* Trigger sources and their arguments. */
	cmd->start_src = src.start_src;
	cmd->start_arg = src.start_arg;
	cmd->scan_begin_src = src.scan_begin_src;
	cmd->scan_begin_arg = src.scan_begin_arg;
	cmd->convert_src = src.convert_src;
	cmd->convert_arg = src.convert_arg;
	cmd->scan_end_src = src.scan_end_src;
	cmd->scan_end_arg = src.scan_end_arg;
	cmd->stop_src = src.stop_src;
	cmd->stop_arg = src.stop_arg;

	/* Channel list and data area: pointers need widening. */
	cmd->chanlist = (unsigned int __force *)compat_ptr(src.chanlist);
	cmd->chanlist_len = src.chanlist_len;
	cmd->data = compat_ptr(src.data);
	cmd->data_len = src.data_len;

	return 0;
}

/*
 * put_compat_cmd() - store a native command into a 32-bit user structure
 * @cmd32: user-space pointer to the 32-bit command structure.
 * @cmd: native command structure to copy from.
 *
 * The 32-bit image is built in a zeroed local structure and written out in
 * a single copy_to_user() call.
 *
 * Return: 0 on success, -EFAULT if @cmd32 cannot be written.
 */
static int put_compat_cmd(struct comedi32_cmd_struct __user *cmd32,
			  struct comedi_cmd *cmd)
{
	struct comedi32_cmd_struct dst;

	memset(&dst, 0, sizeof(dst));

	dst.subdev = cmd->subdev;
	dst.flags = cmd->flags;

	dst.start_src = cmd->start_src;
	dst.start_arg = cmd->start_arg;
	dst.scan_begin_src = cmd->scan_begin_src;
	dst.scan_begin_arg = cmd->scan_begin_arg;
	dst.convert_src = cmd->convert_src;
	dst.convert_arg = cmd->convert_arg;
	dst.scan_end_src = cmd->scan_end_src;
	dst.scan_end_arg = cmd->scan_end_arg;
	dst.stop_src = cmd->stop_src;
	dst.stop_arg = cmd->stop_arg;

	/* Assume chanlist pointer is unchanged. */
	dst.chanlist = ptr_to_compat((unsigned int __user *)cmd->chanlist);
	dst.chanlist_len = cmd->chanlist_len;
	dst.data = ptr_to_compat(cmd->data);
	dst.data_len = cmd->data_len;

	return copy_to_user(cmd32, &dst, sizeof(dst)) ? -EFAULT : 0;
}

/* Handle 32-bit COMEDI_CMD ioctl.
*/ static int compat_cmd(struct file *file, unsigned long arg) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi_cmd cmd; bool copy = false; int rc, err; rc = get_compat_cmd(&cmd, compat_ptr(arg)); if (rc) return rc; mutex_lock(&dev->mutex); rc = do_cmd_ioctl(dev, &cmd, &copy, file); mutex_unlock(&dev->mutex); if (copy) { /* Special case: copy cmd back to user. */ err = put_compat_cmd(compat_ptr(arg), &cmd); if (err) rc = err; } return rc; } /* Handle 32-bit COMEDI_CMDTEST ioctl. */ static int compat_cmdtest(struct file *file, unsigned long arg) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi_cmd cmd; bool copy = false; int rc, err; rc = get_compat_cmd(&cmd, compat_ptr(arg)); if (rc) return rc; mutex_lock(&dev->mutex); rc = do_cmdtest_ioctl(dev, &cmd, &copy, file); mutex_unlock(&dev->mutex); if (copy) { err = put_compat_cmd(compat_ptr(arg), &cmd); if (err) rc = err; } return rc; } /* Copy 32-bit insn structure to native insn structure. */ static int get_compat_insn(struct comedi_insn *insn, struct comedi32_insn_struct __user *insn32) { struct comedi32_insn_struct v32; /* Copy insn structure. Ignore the unused members. */ if (copy_from_user(&v32, insn32, sizeof(v32))) return -EFAULT; memset(insn, 0, sizeof(*insn)); insn->insn = v32.insn; insn->n = v32.n; insn->data = compat_ptr(v32.data); insn->subdev = v32.subdev; insn->chanspec = v32.chanspec; return 0; } /* Handle 32-bit COMEDI_INSNLIST ioctl. 
*/ static int compat_insnlist(struct file *file, unsigned long arg) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi32_insnlist_struct insnlist32; struct comedi32_insn_struct __user *insn32; struct comedi_insn *insns; unsigned int n; int rc; if (copy_from_user(&insnlist32, compat_ptr(arg), sizeof(insnlist32))) return -EFAULT; insns = kcalloc(insnlist32.n_insns, sizeof(*insns), GFP_KERNEL); if (!insns) return -ENOMEM; /* Copy insn structures. */ insn32 = compat_ptr(insnlist32.insns); for (n = 0; n < insnlist32.n_insns; n++) { rc = get_compat_insn(insns + n, insn32 + n); if (rc) { kfree(insns); return rc; } } mutex_lock(&dev->mutex); rc = do_insnlist_ioctl(dev, insns, insnlist32.n_insns, file); mutex_unlock(&dev->mutex); kfree(insns); return rc; } /* Handle 32-bit COMEDI_INSN ioctl. */ static int compat_insn(struct file *file, unsigned long arg) { struct comedi_file *cfp = file->private_data; struct comedi_device *dev = cfp->dev; struct comedi_insn insn; int rc; rc = get_compat_insn(&insn, (void __user *)arg); if (rc) return rc; mutex_lock(&dev->mutex); rc = do_insn_ioctl(dev, &insn, file); mutex_unlock(&dev->mutex); return rc; } /* * compat_ioctl file operation. * * Returns -ENOIOCTLCMD for unrecognised ioctl codes. */ static long comedi_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int rc; switch (cmd) { case COMEDI_DEVCONFIG: case COMEDI_DEVINFO: case COMEDI_SUBDINFO: case COMEDI_BUFCONFIG: case COMEDI_BUFINFO: /* Just need to translate the pointer argument. */ arg = (unsigned long)compat_ptr(arg); rc = comedi_unlocked_ioctl(file, cmd, arg); break; case COMEDI_LOCK: case COMEDI_UNLOCK: case COMEDI_CANCEL: case COMEDI_POLL: case COMEDI_SETRSUBD: case COMEDI_SETWSUBD: /* No translation needed. 
*/ rc = comedi_unlocked_ioctl(file, cmd, arg); break; case COMEDI32_CHANINFO: rc = compat_chaninfo(file, arg); break; case COMEDI32_RANGEINFO: rc = compat_rangeinfo(file, arg); break; case COMEDI32_CMD: rc = compat_cmd(file, arg); break; case COMEDI32_CMDTEST: rc = compat_cmdtest(file, arg); break; case COMEDI32_INSNLIST: rc = compat_insnlist(file, arg); break; case COMEDI32_INSN: rc = compat_insn(file, arg); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #else #define comedi_compat_ioctl NULL #endif static const struct file_operations comedi_fops = { .owner = THIS_MODULE, .unlocked_ioctl = comedi_unlocked_ioctl, .compat_ioctl = comedi_compat_ioctl, .open = comedi_open, .release = comedi_close, .read = comedi_read, .write = comedi_write, .mmap = comedi_mmap, .poll = comedi_poll, .fasync = comedi_fasync, .llseek = noop_llseek, }; /** * comedi_event() - Handle events for asynchronous COMEDI command * @dev: COMEDI device. * @s: COMEDI subdevice. * Context: in_interrupt() (usually), @s->spin_lock spin-lock not held. * * If an asynchronous COMEDI command is active on the subdevice, process * any %COMEDI_CB_... event flags that have been set, usually by an * interrupt handler. These may change the run state of the asynchronous * command, wake a task, and/or send a %SIGIO signal. */ void comedi_event(struct comedi_device *dev, struct comedi_subdevice *s) { struct comedi_async *async = s->async; unsigned int events; int si_code = 0; unsigned long flags; spin_lock_irqsave(&s->spin_lock, flags); events = async->events; async->events = 0; if (!__comedi_is_subdevice_running(s)) { spin_unlock_irqrestore(&s->spin_lock, flags); return; } if (events & COMEDI_CB_CANCEL_MASK) __comedi_clear_subdevice_runflags(s, COMEDI_SRF_RUNNING); /* * Remember if an error event has occurred, so an error can be * returned the next time the user does a read() or write(). 
*/ if (events & COMEDI_CB_ERROR_MASK) __comedi_set_subdevice_runflags(s, COMEDI_SRF_ERROR); if (async->cb_mask & events) { wake_up_interruptible(&async->wait_head); si_code = async->cmd.flags & CMDF_WRITE ? POLL_OUT : POLL_IN; } spin_unlock_irqrestore(&s->spin_lock, flags); if (si_code) kill_fasync(&dev->async_queue, SIGIO, si_code); } EXPORT_SYMBOL_GPL(comedi_event); /* Note: the ->mutex is pre-locked on successful return */ struct comedi_device *comedi_alloc_board_minor(struct device *hardware_device) { struct comedi_device *dev; struct device *csdev; unsigned int i; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); comedi_device_init(dev); comedi_set_hw_dev(dev, hardware_device); mutex_lock(&dev->mutex); mutex_lock(&comedi_board_minor_table_lock); for (i = hardware_device ? comedi_num_legacy_minors : 0; i < COMEDI_NUM_BOARD_MINORS; ++i) { if (!comedi_board_minor_table[i]) { comedi_board_minor_table[i] = dev; break; } } mutex_unlock(&comedi_board_minor_table_lock); if (i == COMEDI_NUM_BOARD_MINORS) { mutex_unlock(&dev->mutex); comedi_device_cleanup(dev); comedi_dev_put(dev); dev_err(hardware_device, "ran out of minor numbers for board device files\n"); return ERR_PTR(-EBUSY); } dev->minor = i; csdev = device_create(&comedi_class, hardware_device, MKDEV(COMEDI_MAJOR, i), NULL, "comedi%i", i); if (!IS_ERR(csdev)) dev->class_dev = get_device(csdev); /* Note: dev->mutex needs to be unlocked by the caller. 
*/ return dev; } void comedi_release_hardware_device(struct device *hardware_device) { int minor; struct comedi_device *dev; for (minor = comedi_num_legacy_minors; minor < COMEDI_NUM_BOARD_MINORS; minor++) { mutex_lock(&comedi_board_minor_table_lock); dev = comedi_board_minor_table[minor]; if (dev && dev->hw_dev == hardware_device) { comedi_board_minor_table[minor] = NULL; mutex_unlock(&comedi_board_minor_table_lock); comedi_free_board_dev(dev); break; } mutex_unlock(&comedi_board_minor_table_lock); } } int comedi_alloc_subdevice_minor(struct comedi_subdevice *s) { struct comedi_device *dev = s->device; struct device *csdev; unsigned int i; mutex_lock(&comedi_subdevice_minor_table_lock); for (i = 0; i < COMEDI_NUM_SUBDEVICE_MINORS; ++i) { if (!comedi_subdevice_minor_table[i]) { comedi_subdevice_minor_table[i] = s; break; } } mutex_unlock(&comedi_subdevice_minor_table_lock); if (i == COMEDI_NUM_SUBDEVICE_MINORS) { dev_err(dev->class_dev, "ran out of minor numbers for subdevice files\n"); return -EBUSY; } i += COMEDI_NUM_BOARD_MINORS; s->minor = i; csdev = device_create(&comedi_class, dev->class_dev, MKDEV(COMEDI_MAJOR, i), NULL, "comedi%i_subd%i", dev->minor, s->index); if (!IS_ERR(csdev)) s->class_dev = csdev; return 0; } void comedi_free_subdevice_minor(struct comedi_subdevice *s) { unsigned int i; if (!s) return; if (s->minor < COMEDI_NUM_BOARD_MINORS || s->minor >= COMEDI_NUM_MINORS) return; i = s->minor - COMEDI_NUM_BOARD_MINORS; mutex_lock(&comedi_subdevice_minor_table_lock); if (s == comedi_subdevice_minor_table[i]) comedi_subdevice_minor_table[i] = NULL; mutex_unlock(&comedi_subdevice_minor_table_lock); if (s->class_dev) { device_destroy(&comedi_class, MKDEV(COMEDI_MAJOR, s->minor)); s->class_dev = NULL; } } static void comedi_cleanup_board_minors(void) { struct comedi_device *dev; unsigned int i; for (i = 0; i < COMEDI_NUM_BOARD_MINORS; i++) { dev = comedi_clear_board_minor(i); comedi_free_board_dev(dev); } } static int __init comedi_init(void) { int i; int 
retval; pr_info("version " COMEDI_RELEASE " - http://www.comedi.org\n"); if (comedi_num_legacy_minors > COMEDI_NUM_BOARD_MINORS) { pr_err("invalid value for module parameter \"comedi_num_legacy_minors\". Valid values are 0 through %i.\n", COMEDI_NUM_BOARD_MINORS); return -EINVAL; } retval = register_chrdev_region(MKDEV(COMEDI_MAJOR, 0), COMEDI_NUM_MINORS, "comedi"); if (retval) return retval; cdev_init(&comedi_cdev, &comedi_fops); comedi_cdev.owner = THIS_MODULE; retval = kobject_set_name(&comedi_cdev.kobj, "comedi"); if (retval) goto out_unregister_chrdev_region; retval = cdev_add(&comedi_cdev, MKDEV(COMEDI_MAJOR, 0), COMEDI_NUM_MINORS); if (retval) goto out_unregister_chrdev_region; retval = class_register(&comedi_class); if (retval) { pr_err("failed to create class\n"); goto out_cdev_del; } /* create devices files for legacy/manual use */ for (i = 0; i < comedi_num_legacy_minors; i++) { struct comedi_device *dev; dev = comedi_alloc_board_minor(NULL); if (IS_ERR(dev)) { retval = PTR_ERR(dev); goto out_cleanup_board_minors; } /* comedi_alloc_board_minor() locked the mutex */ lockdep_assert_held(&dev->mutex); mutex_unlock(&dev->mutex); } /* XXX requires /proc interface */ comedi_proc_init(); return 0; out_cleanup_board_minors: comedi_cleanup_board_minors(); class_unregister(&comedi_class); out_cdev_del: cdev_del(&comedi_cdev); out_unregister_chrdev_region: unregister_chrdev_region(MKDEV(COMEDI_MAJOR, 0), COMEDI_NUM_MINORS); return retval; } module_init(comedi_init); static void __exit comedi_cleanup(void) { comedi_cleanup_board_minors(); class_unregister(&comedi_class); cdev_del(&comedi_cdev); unregister_chrdev_region(MKDEV(COMEDI_MAJOR, 0), COMEDI_NUM_MINORS); comedi_proc_cleanup(); } module_exit(comedi_cleanup); MODULE_AUTHOR("https://www.comedi.org"); MODULE_DESCRIPTION("Comedi core module"); MODULE_LICENSE("GPL");
48 48 8 8 40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/ethtool.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool 
found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? 
"up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); seq_printf(seq, "Peer Notification Delay (ms): %d\n", bond->params.peer_notif_delay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP Missed Max: %u\n", bond->params.missed_max); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); #if IS_ENABLED(CONFIG_IPV6) printed = 0; seq_printf(seq, "NS IPv6 target/s (xx::xx form):"); for (i = 0; (i < BOND_MAX_NS_TARGETS); i++) { if (ipv6_addr_any(&bond->params.ns_targets[i])) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI6c", &bond->params.ns_targets[i]); printed = 1; } seq_printf(seq, "\n"); #endif } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP active: %s\n", (bond->params.lacp_active) ? "on" : "off"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? 
"fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? 
"full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", READ_ONCE(slave->queue_id)); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else 
bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_seq_data(bond_dev->name, 0444, bn->proc_dir, &bond_info_seq_ops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } }
2 2 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 2 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 // SPDX-License-Identifier: GPL-2.0-or-later /* * Etoms Et61x151 GPL Linux driver by Michel Xhaard (09/09/2004) * * V4L2 by Jean-Francois Moine <http://moinejf.free.fr> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "etoms" #include "gspca.h" MODULE_AUTHOR("Michel Xhaard <mxhaard@users.sourceforge.net>"); MODULE_DESCRIPTION("Etoms USB Camera Driver"); MODULE_LICENSE("GPL"); /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! 
must be the first item */ unsigned char autogain; char sensor; #define SENSOR_PAS106 0 #define SENSOR_TAS5130CXX 1 signed char ag_cnt; #define AG_CNT_START 13 }; static const struct v4l2_pix_format vga_mode[] = { {320, 240, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, /* {640, 480, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, */ }; static const struct v4l2_pix_format sif_mode[] = { {176, 144, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 176, .sizeimage = 176 * 144, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, {352, 288, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, .bytesperline = 352, .sizeimage = 352 * 288, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, }; #define ETOMS_ALT_SIZE_1000 12 #define ET_GPIO_DIR_CTRL 0x04 /* Control IO bit[0..5] (0 in 1 out) */ #define ET_GPIO_OUT 0x05 /* Only IO data */ #define ET_GPIO_IN 0x06 /* Read Only IO data */ #define ET_RESET_ALL 0x03 #define ET_ClCK 0x01 #define ET_CTRL 0x02 /* enable i2c OutClck Powerdown mode */ #define ET_COMP 0x12 /* Compression register */ #define ET_MAXQt 0x13 #define ET_MINQt 0x14 #define ET_COMP_VAL0 0x02 #define ET_COMP_VAL1 0x03 #define ET_REG1d 0x1d #define ET_REG1e 0x1e #define ET_REG1f 0x1f #define ET_REG20 0x20 #define ET_REG21 0x21 #define ET_REG22 0x22 #define ET_REG23 0x23 #define ET_REG24 0x24 #define ET_REG25 0x25 /* base registers for luma calculation */ #define ET_LUMA_CENTER 0x39 #define ET_G_RED 0x4d #define ET_G_GREEN1 0x4e #define ET_G_BLUE 0x4f #define ET_G_GREEN2 0x50 #define ET_G_GR_H 0x51 #define ET_G_GB_H 0x52 #define ET_O_RED 0x34 #define ET_O_GREEN1 0x35 #define ET_O_BLUE 0x36 #define ET_O_GREEN2 0x37 #define ET_SYNCHRO 0x68 #define ET_STARTX 0x69 #define ET_STARTY 0x6a #define ET_WIDTH_LOW 0x6b #define ET_HEIGTH_LOW 0x6c #define ET_W_H_HEIGTH 0x6d #define ET_REG6e 0x6e /* OBW */ #define ET_REG6f 0x6f /* 
OBW */ #define ET_REG70 0x70 /* OBW_AWB */ #define ET_REG71 0x71 /* OBW_AWB */ #define ET_REG72 0x72 /* OBW_AWB */ #define ET_REG73 0x73 /* Clkdelay ns */ #define ET_REG74 0x74 /* test pattern */ #define ET_REG75 0x75 /* test pattern */ #define ET_I2C_CLK 0x8c #define ET_PXL_CLK 0x60 #define ET_I2C_BASE 0x89 #define ET_I2C_COUNT 0x8a #define ET_I2C_PREFETCH 0x8b #define ET_I2C_REG 0x88 #define ET_I2C_DATA7 0x87 #define ET_I2C_DATA6 0x86 #define ET_I2C_DATA5 0x85 #define ET_I2C_DATA4 0x84 #define ET_I2C_DATA3 0x83 #define ET_I2C_DATA2 0x82 #define ET_I2C_DATA1 0x81 #define ET_I2C_DATA0 0x80 #define PAS106_REG2 0x02 /* pxlClk = systemClk/(reg2) */ #define PAS106_REG3 0x03 /* line/frame H [11..4] */ #define PAS106_REG4 0x04 /* line/frame L [3..0] */ #define PAS106_REG5 0x05 /* exposure time line offset(default 5) */ #define PAS106_REG6 0x06 /* exposure time pixel offset(default 6) */ #define PAS106_REG7 0x07 /* signbit Dac (default 0) */ #define PAS106_REG9 0x09 #define PAS106_REG0e 0x0e /* global gain [4..0](default 0x0e) */ #define PAS106_REG13 0x13 /* end i2c write */ static const __u8 GainRGBG[] = { 0x80, 0x80, 0x80, 0x80, 0x00, 0x00 }; static const __u8 I2c2[] = { 0x08, 0x08, 0x08, 0x08, 0x0d }; static const __u8 I2c3[] = { 0x12, 0x05 }; static const __u8 I2c4[] = { 0x41, 0x08 }; /* read 'len' bytes to gspca_dev->usb_buf */ static void reg_r(struct gspca_dev *gspca_dev, __u16 index, __u16 len) { struct usb_device *dev = gspca_dev->dev; if (len > USB_BUF_SZ) { gspca_err(gspca_dev, "reg_r: buffer overflow\n"); return; } usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), 0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, index, gspca_dev->usb_buf, len, 500); gspca_dbg(gspca_dev, D_USBI, "reg read [%02x] -> %02x ..\n", index, gspca_dev->usb_buf[0]); } static void reg_w_val(struct gspca_dev *gspca_dev, __u16 index, __u8 val) { struct usb_device *dev = gspca_dev->dev; gspca_dev->usb_buf[0] = val; usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0, USB_DIR_OUT | 
USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, index, gspca_dev->usb_buf, 1, 500); } static void reg_w(struct gspca_dev *gspca_dev, __u16 index, const __u8 *buffer, __u16 len) { struct usb_device *dev = gspca_dev->dev; if (len > USB_BUF_SZ) { pr_err("reg_w: buffer overflow\n"); return; } gspca_dbg(gspca_dev, D_USBO, "reg write [%02x] = %02x..\n", index, *buffer); memcpy(gspca_dev->usb_buf, buffer, len); usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, 0, index, gspca_dev->usb_buf, len, 500); } static int i2c_w(struct gspca_dev *gspca_dev, __u8 reg, const __u8 *buffer, int len, __u8 mode) { /* buffer should be [D0..D7] */ __u8 ptchcount; /* set the base address */ reg_w_val(gspca_dev, ET_I2C_BASE, 0x40); /* sensor base for the pas106 */ /* set count and prefetch */ ptchcount = ((len & 0x07) << 4) | (mode & 0x03); reg_w_val(gspca_dev, ET_I2C_COUNT, ptchcount); /* set the register base */ reg_w_val(gspca_dev, ET_I2C_REG, reg); while (--len >= 0) reg_w_val(gspca_dev, ET_I2C_DATA0 + len, buffer[len]); return 0; } static int i2c_r(struct gspca_dev *gspca_dev, __u8 reg) { /* set the base address */ reg_w_val(gspca_dev, ET_I2C_BASE, 0x40); /* sensor base for the pas106 */ /* set count and prefetch (cnd: 4 bits - mode: 4 bits) */ reg_w_val(gspca_dev, ET_I2C_COUNT, 0x11); reg_w_val(gspca_dev, ET_I2C_REG, reg); /* set the register base */ reg_w_val(gspca_dev, ET_I2C_PREFETCH, 0x02); /* prefetch */ reg_w_val(gspca_dev, ET_I2C_PREFETCH, 0x00); reg_r(gspca_dev, ET_I2C_DATA0, 1); /* read one byte */ return 0; } static int Et_WaitStatus(struct gspca_dev *gspca_dev) { int retry = 10; while (retry--) { reg_r(gspca_dev, ET_ClCK, 1); if (gspca_dev->usb_buf[0] != 0) return 1; } return 0; } static int et_video(struct gspca_dev *gspca_dev, int on) { int ret; reg_w_val(gspca_dev, ET_GPIO_OUT, on ? 
0x10 /* startvideo - set Bit5 */ : 0); /* stopvideo */ ret = Et_WaitStatus(gspca_dev); if (ret != 0) gspca_err(gspca_dev, "timeout video on/off\n"); return ret; } static void Et_init2(struct gspca_dev *gspca_dev) { __u8 value; static const __u8 FormLine[] = { 0x84, 0x03, 0x14, 0xf4, 0x01, 0x05 }; gspca_dbg(gspca_dev, D_STREAM, "Open Init2 ET\n"); reg_w_val(gspca_dev, ET_GPIO_DIR_CTRL, 0x2f); reg_w_val(gspca_dev, ET_GPIO_OUT, 0x10); reg_r(gspca_dev, ET_GPIO_IN, 1); reg_w_val(gspca_dev, ET_ClCK, 0x14); /* 0x14 // 0x16 enabled pattern */ reg_w_val(gspca_dev, ET_CTRL, 0x1b); /* compression et subsampling */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) value = ET_COMP_VAL1; /* 320 */ else value = ET_COMP_VAL0; /* 640 */ reg_w_val(gspca_dev, ET_COMP, value); reg_w_val(gspca_dev, ET_MAXQt, 0x1f); reg_w_val(gspca_dev, ET_MINQt, 0x04); /* undocumented registers */ reg_w_val(gspca_dev, ET_REG1d, 0xff); reg_w_val(gspca_dev, ET_REG1e, 0xff); reg_w_val(gspca_dev, ET_REG1f, 0xff); reg_w_val(gspca_dev, ET_REG20, 0x35); reg_w_val(gspca_dev, ET_REG21, 0x01); reg_w_val(gspca_dev, ET_REG22, 0x00); reg_w_val(gspca_dev, ET_REG23, 0xff); reg_w_val(gspca_dev, ET_REG24, 0xff); reg_w_val(gspca_dev, ET_REG25, 0x0f); /* colors setting */ reg_w_val(gspca_dev, 0x30, 0x11); /* 0x30 */ reg_w_val(gspca_dev, 0x31, 0x40); reg_w_val(gspca_dev, 0x32, 0x00); reg_w_val(gspca_dev, ET_O_RED, 0x00); /* 0x34 */ reg_w_val(gspca_dev, ET_O_GREEN1, 0x00); reg_w_val(gspca_dev, ET_O_BLUE, 0x00); reg_w_val(gspca_dev, ET_O_GREEN2, 0x00); /*************/ reg_w_val(gspca_dev, ET_G_RED, 0x80); /* 0x4d */ reg_w_val(gspca_dev, ET_G_GREEN1, 0x80); reg_w_val(gspca_dev, ET_G_BLUE, 0x80); reg_w_val(gspca_dev, ET_G_GREEN2, 0x80); reg_w_val(gspca_dev, ET_G_GR_H, 0x00); reg_w_val(gspca_dev, ET_G_GB_H, 0x00); /* 0x52 */ /* Window control registers */ reg_w_val(gspca_dev, 0x61, 0x80); /* use cmc_out */ reg_w_val(gspca_dev, 0x62, 0x02); reg_w_val(gspca_dev, 0x63, 0x03); reg_w_val(gspca_dev, 0x64, 0x14); 
reg_w_val(gspca_dev, 0x65, 0x0e); reg_w_val(gspca_dev, 0x66, 0x02); reg_w_val(gspca_dev, 0x67, 0x02); /**************************************/ reg_w_val(gspca_dev, ET_SYNCHRO, 0x8f); /* 0x68 */ reg_w_val(gspca_dev, ET_STARTX, 0x69); /* 0x6a //0x69 */ reg_w_val(gspca_dev, ET_STARTY, 0x0d); /* 0x0d //0x0c */ reg_w_val(gspca_dev, ET_WIDTH_LOW, 0x80); reg_w_val(gspca_dev, ET_HEIGTH_LOW, 0xe0); reg_w_val(gspca_dev, ET_W_H_HEIGTH, 0x60); /* 6d */ reg_w_val(gspca_dev, ET_REG6e, 0x86); reg_w_val(gspca_dev, ET_REG6f, 0x01); reg_w_val(gspca_dev, ET_REG70, 0x26); reg_w_val(gspca_dev, ET_REG71, 0x7a); reg_w_val(gspca_dev, ET_REG72, 0x01); /* Clock Pattern registers ***************** */ reg_w_val(gspca_dev, ET_REG73, 0x00); reg_w_val(gspca_dev, ET_REG74, 0x18); /* 0x28 */ reg_w_val(gspca_dev, ET_REG75, 0x0f); /* 0x01 */ /**********************************************/ reg_w_val(gspca_dev, 0x8a, 0x20); reg_w_val(gspca_dev, 0x8d, 0x0f); reg_w_val(gspca_dev, 0x8e, 0x08); /**************************************/ reg_w_val(gspca_dev, 0x03, 0x08); reg_w_val(gspca_dev, ET_PXL_CLK, 0x03); reg_w_val(gspca_dev, 0x81, 0xff); reg_w_val(gspca_dev, 0x80, 0x00); reg_w_val(gspca_dev, 0x81, 0xff); reg_w_val(gspca_dev, 0x80, 0x20); reg_w_val(gspca_dev, 0x03, 0x01); reg_w_val(gspca_dev, 0x03, 0x00); reg_w_val(gspca_dev, 0x03, 0x08); /********************************************/ /* reg_r(gspca_dev, ET_I2C_BASE, 1); always 0x40 as the pas106 ??? 
*/ /* set the sensor */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) value = 0x04; /* 320 */ else /* 640 */ value = 0x1e; /* 0x17 * setting PixelClock * 0x03 mean 24/(3+1) = 6 Mhz * 0x05 -> 24/(5+1) = 4 Mhz * 0x0b -> 24/(11+1) = 2 Mhz * 0x17 -> 24/(23+1) = 1 Mhz */ reg_w_val(gspca_dev, ET_PXL_CLK, value); /* now set by fifo the FormatLine setting */ reg_w(gspca_dev, 0x62, FormLine, 6); /* set exposure times [ 0..0x78] 0->longvalue 0x78->shortvalue */ reg_w_val(gspca_dev, 0x81, 0x47); /* 0x47; */ reg_w_val(gspca_dev, 0x80, 0x40); /* 0x40; */ /* Pedro change */ /* Brightness change Brith+ decrease value */ /* Brigth- increase value */ /* original value = 0x70; */ reg_w_val(gspca_dev, 0x81, 0x30); /* 0x20; - set brightness */ reg_w_val(gspca_dev, 0x80, 0x20); /* 0x20; */ } static void setbrightness(struct gspca_dev *gspca_dev, s32 val) { int i; for (i = 0; i < 4; i++) reg_w_val(gspca_dev, ET_O_RED + i, val); } static void setcontrast(struct gspca_dev *gspca_dev, s32 val) { __u8 RGBG[] = { 0x80, 0x80, 0x80, 0x80, 0x00, 0x00 }; memset(RGBG, val, sizeof(RGBG) - 2); reg_w(gspca_dev, ET_G_RED, RGBG, 6); } static void setcolors(struct gspca_dev *gspca_dev, s32 val) { struct sd *sd = (struct sd *) gspca_dev; __u8 I2cc[] = { 0x05, 0x02, 0x02, 0x05, 0x0d }; __u8 i2cflags = 0x01; /* __u8 green = 0; */ I2cc[3] = val; /* red */ I2cc[0] = 15 - val; /* blue */ /* green = 15 - ((((7*I2cc[0]) >> 2 ) + I2cc[3]) >> 1); */ /* I2cc[1] = I2cc[2] = green; */ if (sd->sensor == SENSOR_PAS106) { i2c_w(gspca_dev, PAS106_REG13, &i2cflags, 1, 3); i2c_w(gspca_dev, PAS106_REG9, I2cc, sizeof I2cc, 1); } } static s32 getcolors(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) { /* i2c_r(gspca_dev, PAS106_REG9); * blue */ i2c_r(gspca_dev, PAS106_REG9 + 3); /* red */ return gspca_dev->usb_buf[0] & 0x0f; } return 0; } static void setautogain(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->autogain) 
sd->ag_cnt = AG_CNT_START; else sd->ag_cnt = -1; } static void Et_init1(struct gspca_dev *gspca_dev) { __u8 value; /* __u8 I2c0 [] = {0x0a, 0x12, 0x05, 0x22, 0xac, 0x00, 0x01, 0x00}; */ __u8 I2c0[] = { 0x0a, 0x12, 0x05, 0x6d, 0xcd, 0x00, 0x01, 0x00 }; /* try 1/120 0x6d 0xcd 0x40 */ /* __u8 I2c0 [] = {0x0a, 0x12, 0x05, 0xfe, 0xfe, 0xc0, 0x01, 0x00}; * 1/60000 hmm ?? */ gspca_dbg(gspca_dev, D_STREAM, "Open Init1 ET\n\n"); reg_w_val(gspca_dev, ET_GPIO_DIR_CTRL, 7); reg_r(gspca_dev, ET_GPIO_IN, 1); reg_w_val(gspca_dev, ET_RESET_ALL, 1); reg_w_val(gspca_dev, ET_RESET_ALL, 0); reg_w_val(gspca_dev, ET_ClCK, 0x10); reg_w_val(gspca_dev, ET_CTRL, 0x19); /* compression et subsampling */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) value = ET_COMP_VAL1; else value = ET_COMP_VAL0; gspca_dbg(gspca_dev, D_STREAM, "Open mode %d Compression %d\n", gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv, value); reg_w_val(gspca_dev, ET_COMP, value); reg_w_val(gspca_dev, ET_MAXQt, 0x1d); reg_w_val(gspca_dev, ET_MINQt, 0x02); /* undocumented registers */ reg_w_val(gspca_dev, ET_REG1d, 0xff); reg_w_val(gspca_dev, ET_REG1e, 0xff); reg_w_val(gspca_dev, ET_REG1f, 0xff); reg_w_val(gspca_dev, ET_REG20, 0x35); reg_w_val(gspca_dev, ET_REG21, 0x01); reg_w_val(gspca_dev, ET_REG22, 0x00); reg_w_val(gspca_dev, ET_REG23, 0xf7); reg_w_val(gspca_dev, ET_REG24, 0xff); reg_w_val(gspca_dev, ET_REG25, 0x07); /* colors setting */ reg_w_val(gspca_dev, ET_G_RED, 0x80); reg_w_val(gspca_dev, ET_G_GREEN1, 0x80); reg_w_val(gspca_dev, ET_G_BLUE, 0x80); reg_w_val(gspca_dev, ET_G_GREEN2, 0x80); reg_w_val(gspca_dev, ET_G_GR_H, 0x00); reg_w_val(gspca_dev, ET_G_GB_H, 0x00); /* Window control registers */ reg_w_val(gspca_dev, ET_SYNCHRO, 0xf0); reg_w_val(gspca_dev, ET_STARTX, 0x56); /* 0x56 */ reg_w_val(gspca_dev, ET_STARTY, 0x05); /* 0x04 */ reg_w_val(gspca_dev, ET_WIDTH_LOW, 0x60); reg_w_val(gspca_dev, ET_HEIGTH_LOW, 0x20); reg_w_val(gspca_dev, ET_W_H_HEIGTH, 0x50); reg_w_val(gspca_dev, ET_REG6e, 
0x86); reg_w_val(gspca_dev, ET_REG6f, 0x01); reg_w_val(gspca_dev, ET_REG70, 0x86); reg_w_val(gspca_dev, ET_REG71, 0x14); reg_w_val(gspca_dev, ET_REG72, 0x00); /* Clock Pattern registers */ reg_w_val(gspca_dev, ET_REG73, 0x00); reg_w_val(gspca_dev, ET_REG74, 0x00); reg_w_val(gspca_dev, ET_REG75, 0x0a); reg_w_val(gspca_dev, ET_I2C_CLK, 0x04); reg_w_val(gspca_dev, ET_PXL_CLK, 0x01); /* set the sensor */ if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv) { I2c0[0] = 0x06; i2c_w(gspca_dev, PAS106_REG2, I2c0, sizeof I2c0, 1); i2c_w(gspca_dev, PAS106_REG9, I2c2, sizeof I2c2, 1); value = 0x06; i2c_w(gspca_dev, PAS106_REG2, &value, 1, 1); i2c_w(gspca_dev, PAS106_REG3, I2c3, sizeof I2c3, 1); /* value = 0x1f; */ value = 0x04; i2c_w(gspca_dev, PAS106_REG0e, &value, 1, 1); } else { I2c0[0] = 0x0a; i2c_w(gspca_dev, PAS106_REG2, I2c0, sizeof I2c0, 1); i2c_w(gspca_dev, PAS106_REG9, I2c2, sizeof I2c2, 1); value = 0x0a; i2c_w(gspca_dev, PAS106_REG2, &value, 1, 1); i2c_w(gspca_dev, PAS106_REG3, I2c3, sizeof I2c3, 1); value = 0x04; /* value = 0x10; */ i2c_w(gspca_dev, PAS106_REG0e, &value, 1, 1); /* bit 2 enable bit 1:2 select 0 1 2 3 value = 0x07; * curve 0 * i2c_w(gspca_dev, PAS106_REG0f, &value, 1, 1); */ } /* value = 0x01; */ /* value = 0x22; */ /* i2c_w(gspca_dev, PAS106_REG5, &value, 1, 1); */ /* magnetude and sign bit for DAC */ i2c_w(gspca_dev, PAS106_REG7, I2c4, sizeof I2c4, 1); /* now set by fifo the whole colors setting */ reg_w(gspca_dev, ET_G_RED, GainRGBG, 6); setcolors(gspca_dev, getcolors(gspca_dev)); } /* this function is called at probe time */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; struct cam *cam; cam = &gspca_dev->cam; sd->sensor = id->driver_info; if (sd->sensor == SENSOR_PAS106) { cam->cam_mode = sif_mode; cam->nmodes = ARRAY_SIZE(sif_mode); } else { cam->cam_mode = vga_mode; cam->nmodes = ARRAY_SIZE(vga_mode); } sd->ag_cnt = -1; return 0; } /* this function is 
called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) Et_init1(gspca_dev); else Et_init2(gspca_dev); reg_w_val(gspca_dev, ET_RESET_ALL, 0x08); et_video(gspca_dev, 0); /* video off */ return 0; } /* -- start the camera -- */ static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) Et_init1(gspca_dev); else Et_init2(gspca_dev); setautogain(gspca_dev); reg_w_val(gspca_dev, ET_RESET_ALL, 0x08); et_video(gspca_dev, 1); /* video on */ return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { et_video(gspca_dev, 0); /* video off */ } static __u8 Et_getgainG(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) { i2c_r(gspca_dev, PAS106_REG0e); gspca_dbg(gspca_dev, D_CONF, "Etoms gain G %d\n", gspca_dev->usb_buf[0]); return gspca_dev->usb_buf[0]; } return 0x1f; } static void Et_setgainG(struct gspca_dev *gspca_dev, __u8 gain) { struct sd *sd = (struct sd *) gspca_dev; if (sd->sensor == SENSOR_PAS106) { __u8 i2cflags = 0x01; i2c_w(gspca_dev, PAS106_REG13, &i2cflags, 1, 3); i2c_w(gspca_dev, PAS106_REG0e, &gain, 1, 1); } } #define BLIMIT(bright) \ (u8)((bright > 0x1f) ? 0x1f : ((bright < 4) ? 3 : bright)) #define LIMIT(color) \ (u8)((color > 0xff) ? 0xff : ((color < 0) ? 
0 : color)) static void do_autogain(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; __u8 luma; __u8 luma_mean = 128; __u8 luma_delta = 20; __u8 spring = 4; int Gbright; __u8 r, g, b; if (sd->ag_cnt < 0) return; if (--sd->ag_cnt >= 0) return; sd->ag_cnt = AG_CNT_START; Gbright = Et_getgainG(gspca_dev); reg_r(gspca_dev, ET_LUMA_CENTER, 4); g = (gspca_dev->usb_buf[0] + gspca_dev->usb_buf[3]) >> 1; r = gspca_dev->usb_buf[1]; b = gspca_dev->usb_buf[2]; r = ((r << 8) - (r << 4) - (r << 3)) >> 10; b = ((b << 7) >> 10); g = ((g << 9) + (g << 7) + (g << 5)) >> 10; luma = LIMIT(r + g + b); gspca_dbg(gspca_dev, D_FRAM, "Etoms luma G %d\n", luma); if (luma < luma_mean - luma_delta || luma > luma_mean + luma_delta) { Gbright += (luma_mean - luma) >> spring; Gbright = BLIMIT(Gbright); gspca_dbg(gspca_dev, D_FRAM, "Etoms Gbright %d\n", Gbright); Et_setgainG(gspca_dev, (__u8) Gbright); } } #undef BLIMIT #undef LIMIT static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { int seqframe; seqframe = data[0] & 0x3f; len = (int) (((data[0] & 0xc0) << 2) | data[1]); if (seqframe == 0x3f) { gspca_dbg(gspca_dev, D_FRAM, "header packet found datalength %d !!\n", len); gspca_dbg(gspca_dev, D_FRAM, "G %d R %d G %d B %d", data[2], data[3], data[4], data[5]); data += 30; /* don't change datalength as the chips provided it */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); gspca_frame_add(gspca_dev, FIRST_PACKET, data, len); return; } if (len) { data += 8; gspca_frame_add(gspca_dev, INTER_PACKET, data, len); } else { /* Drop Packet */ gspca_dev->last_packet_type = DISCARD_PACKET; } } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *)gspca_dev; gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: setbrightness(gspca_dev, ctrl->val); break; case 
V4L2_CID_CONTRAST: setcontrast(gspca_dev, ctrl->val); break; case V4L2_CID_SATURATION: setcolors(gspca_dev, ctrl->val); break; case V4L2_CID_AUTOGAIN: sd->autogain = ctrl->val; setautogain(gspca_dev); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *)gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 4); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BRIGHTNESS, 1, 127, 1, 63); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_CONTRAST, 0, 255, 1, 127); if (sd->sensor == SENSOR_PAS106) v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SATURATION, 0, 15, 1, 7); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_AUTOGAIN, 0, 1, 1, 1); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, .dq_callback = do_autogain, }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x102c, 0x6151), .driver_info = SENSOR_PAS106}, {USB_DEVICE(0x102c, 0x6251), .driver_info = SENSOR_TAS5130CXX}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver);
13 13 1 14 13 1 1 14 13 14 14 13 14 14 9 4 4 4 4 10 10 10 10 9 10 10 10 10 10 1 1 3 2 1 7 5 2 7 2 5 4 1 4 14 1 12 1 10 1 1 1 1 1 9 2 1 7 6 3 4 4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 
486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 
986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * The filters are packed to hash tables of key nodes * with a set of 32bit key/mask pairs at every node. * Nodes reference next level hash tables etc. * * This scheme is the best universal classifier I managed to * invent; it is not super-fast, but it is not slow (provided you * program it correctly), and general enough. And its relative * speed grows as the number of rules becomes larger. * * It seems that it represents the best middle point between * speed and manageability both by human and by machine. * * It is especially useful for link sharing combined with QoS; * pure RSVP doesn't need such a general approach and can use * much simpler (and faster) schemes, sort of cls_rsvp.c. 
* * nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/percpu.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/bitmap.h> #include <linux/netdevice.h> #include <linux/hash.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/idr.h> #include <net/tc_wrapper.h> struct tc_u_knode { struct tc_u_knode __rcu *next; u32 handle; struct tc_u_hnode __rcu *ht_up; struct tcf_exts exts; int ifindex; u8 fshift; struct tcf_result res; struct tc_u_hnode __rcu *ht_down; #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt __percpu *pf; #endif u32 flags; unsigned int in_hw_count; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; u32 __percpu *pcpu_success; #endif struct rcu_work rwork; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. */ struct tc_u32_sel sel; }; struct tc_u_hnode { struct tc_u_hnode __rcu *next; u32 handle; u32 prio; refcount_t refcnt; unsigned int divisor; struct idr handle_idr; bool is_root; struct rcu_head rcu; u32 flags; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. */ struct tc_u_knode __rcu *ht[]; }; struct tc_u_common { struct tc_u_hnode __rcu *hlist; void *ptr; refcount_t refcnt; struct idr handle_idr; struct hlist_node hnode; long knodes; }; static u32 handle2id(u32 h) { return ((h & 0x80000000) ? 
((h >> 20) & 0x7FF) : h); } static u32 id2handle(u32 id) { return (id | 0x800U) << 20; } static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) { unsigned int h = ntohl(key & sel->hmask) >> fshift; return h; } TC_INDIRECT_SCOPE int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct { struct tc_u_knode *knode; unsigned int off; } stack[TC_U32_MAXDEPTH]; struct tc_u_hnode *ht = rcu_dereference_bh(tp->root); unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; int off2 = 0; int sel = 0; #ifdef CONFIG_CLS_U32_PERF int j; #endif int i, r; next_ht: n = rcu_dereference_bh(ht->ht[sel]); next_knode: if (n) { struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rcnt); j = 0; #endif if (tc_skip_sw(n->flags)) { n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_MARK if ((skb->mark & n->mask) != n->val) { n = rcu_dereference_bh(n->next); goto next_knode; } else { __this_cpu_inc(*n->pcpu_success); } #endif for (i = n->sel.nkeys; i > 0; i--, key++) { int toff = off + key->off + (off2 & key->offmask); __be32 *data, hdata; if (skb_headroom(skb) + toff > INT_MAX) goto out; data = skb_header_pointer(skb, toff, 4, &hdata); if (!data) goto out; if ((*data ^ key->val) & key->mask) { n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->kcnts[j]); j++; #endif } ht = rcu_dereference_bh(n->ht_down); if (!ht) { check_terminal: if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; if (!tcf_match_indev(skb, n->ifindex)) { n = rcu_dereference_bh(n->next); goto next_knode; } #ifdef CONFIG_CLS_U32_PERF __this_cpu_inc(n->pf->rhit); #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { n = rcu_dereference_bh(n->next); goto next_knode; } return r; } n = rcu_dereference_bh(n->next); goto next_knode; } /* PUSH */ if (sdepth >= TC_U32_MAXDEPTH) goto deadloop; stack[sdepth].knode = n; 
stack[sdepth].off = off; sdepth++; ht = rcu_dereference_bh(n->ht_down); sel = 0; if (ht->divisor) { __be32 *data, hdata; data = skb_header_pointer(skb, off + n->sel.hoff, 4, &hdata); if (!data) goto out; sel = ht->divisor & u32_hash_fold(*data, &n->sel, n->fshift); } if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT))) goto next_ht; if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; if (n->sel.flags & TC_U32_VAROFFSET) { __be16 *data, hdata; data = skb_header_pointer(skb, off + n->sel.offoff, 2, &hdata); if (!data) goto out; off2 += ntohs(n->sel.offmask & *data) >> n->sel.offshift; } off2 &= ~3; } if (n->sel.flags & TC_U32_EAT) { off += off2; off2 = 0; } if (off < skb->len) goto next_ht; } /* POP */ if (sdepth--) { n = stack[sdepth].knode; ht = rcu_dereference_bh(n->ht_up); off = stack[sdepth].off; goto check_terminal; } out: return -1; deadloop: net_warn_ratelimited("cls_u32: dead loop\n"); return -1; } static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; for (ht = rtnl_dereference(tp_c->hlist); ht; ht = rtnl_dereference(ht->next)) if (ht->handle == handle) break; return ht; } static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { unsigned int sel; struct tc_u_knode *n = NULL; sel = TC_U32_HASH(handle); if (sel > ht->divisor) goto out; for (n = rtnl_dereference(ht->ht[sel]); n; n = rtnl_dereference(n->next)) if (n->handle == handle) break; out: return n; } static void *u32_get(struct tcf_proto *tp, u32 handle) { struct tc_u_hnode *ht; struct tc_u_common *tp_c = tp->data; if (TC_U32_HTID(handle) == TC_U32_ROOT) ht = rtnl_dereference(tp->root); else ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); if (!ht) return NULL; if (TC_U32_KEY(handle) == 0) return ht; return u32_lookup_key(ht, handle); } /* Protected by rtnl lock */ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 
0x7FF, GFP_KERNEL); if (id < 0) return 0; return id2handle(id); } static struct hlist_head *tc_u_common_hash; #define U32_HASH_SHIFT 10 #define U32_HASH_SIZE (1 << U32_HASH_SHIFT) static void *tc_u_common_ptr(const struct tcf_proto *tp) { struct tcf_block *block = tp->chain->block; /* The block sharing is currently supported only * for classless qdiscs. In that case we use block * for tc_u_common identification. In case the * block is not shared, block->q is a valid pointer * and we can use that. That works for classful qdiscs. */ if (tcf_block_shared(block)) return block; else return block->q; } static struct hlist_head *tc_u_hash(void *key) { return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(void *key) { struct tc_u_common *tc; hlist_for_each_entry(tc, tc_u_hash(key), hnode) { if (tc->ptr == key) return tc; } return NULL; } static int u32_init(struct tcf_proto *tp) { struct tc_u_hnode *root_ht; void *key = tc_u_common_ptr(tp); struct tc_u_common *tp_c = tc_u_common_find(key); root_ht = kzalloc(struct_size(root_ht, ht, 1), GFP_KERNEL); if (root_ht == NULL) return -ENOBUFS; refcount_set(&root_ht->refcnt, 1); root_ht->handle = tp_c ? 
gen_new_htid(tp_c, root_ht) : id2handle(0); root_ht->prio = tp->prio; root_ht->is_root = true; idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); if (tp_c == NULL) { kfree(root_ht); return -ENOBUFS; } refcount_set(&tp_c->refcnt, 1); tp_c->ptr = key; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); hlist_add_head(&tp_c->hnode, tc_u_hash(key)); } else { refcount_inc(&tp_c->refcnt); } RCU_INIT_POINTER(root_ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, root_ht); /* root_ht must be destroyed when tcf_proto is destroyed */ rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; } static void __u32_destroy_key(struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); tcf_exts_destroy(&n->exts); if (ht && refcount_dec_and_test(&ht->refcnt)) kfree(ht); kfree(n); } static void u32_destroy_key(struct tc_u_knode *n, bool free_pf) { tcf_exts_put_net(&n->exts); #ifdef CONFIG_CLS_U32_PERF if (free_pf) free_percpu(n->pf); #endif #ifdef CONFIG_CLS_U32_MARK if (free_pf) free_percpu(n->pcpu_success); #endif __u32_destroy_key(n); } /* u32_delete_key_rcu should be called when free'ing a copied * version of a tc_u_knode obtained from u32_init_knode(). When * copies are obtained from u32_init_knode() the statistics are * shared between the old and new copies to allow readers to * continue to update the statistics during the copy. To support * this the u32_delete_key_rcu variant does not free the percpu * statistics. */ static void u32_delete_key_work(struct work_struct *work) { struct tc_u_knode *key = container_of(to_rcu_work(work), struct tc_u_knode, rwork); rtnl_lock(); u32_destroy_key(key, false); rtnl_unlock(); } /* u32_delete_key_freepf_rcu is the rcu callback variant * that free's the entire structure including the statistics * percpu variables. Only use this if the key is not a copy * returned by u32_init_knode(). 
See u32_delete_key_rcu() * for the variant that should be used with keys return from * u32_init_knode() */ static void u32_delete_key_freepf_work(struct work_struct *work) { struct tc_u_knode *key = container_of(to_rcu_work(work), struct tc_u_knode, rwork); rtnl_lock(); u32_destroy_key(key, true); rtnl_unlock(); } static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { struct tc_u_common *tp_c = tp->data; struct tc_u_knode __rcu **kp; struct tc_u_knode *pkp; struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); if (ht) { kp = &ht->ht[TC_U32_HASH(key->handle)]; for (pkp = rtnl_dereference(*kp); pkp; kp = &pkp->next, pkp = rtnl_dereference(*kp)) { if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); tp_c->knodes--; tcf_unbind_filter(tp, &key->res); idr_remove(&ht->handle_idr, key->handle); tcf_exts_get_net(&key->exts); tcf_queue_work(&key->rwork, u32_delete_key_freepf_work); return 0; } } } WARN_ON(1); return 0; } static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; tc_cls_common_offload_init(&cls_u32.common, tp, h->flags, extack); cls_u32.command = TC_CLSU32_DELETE_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, false, true); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); bool offloaded = false; int err; tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack); cls_u32.command = TC_CLSU32_NEW_HNODE; cls_u32.hnode.divisor = h->divisor; cls_u32.hnode.handle = h->handle; cls_u32.hnode.prio = h->prio; err = tc_setup_cb_call(block, TC_SETUP_CLSU32, &cls_u32, skip_sw, true); if (err < 0) { u32_clear_hw_hnode(tp, 
h, NULL); return err; } else if (err > 0) { offloaded = true; } if (skip_sw && !offloaded) return -EINVAL; return 0; } static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); cls_u32.command = TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; tc_setup_cb_destroy(block, tp, TC_SETUP_CLSU32, &cls_u32, false, &n->flags, &n->in_hw_count, true); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; bool skip_sw = tc_skip_sw(flags); int err; tc_cls_common_offload_init(&cls_u32.common, tp, flags, extack); cls_u32.command = TC_CLSU32_REPLACE_KNODE; cls_u32.knode.handle = n->handle; cls_u32.knode.fshift = n->fshift; #ifdef CONFIG_CLS_U32_MARK cls_u32.knode.val = n->val; cls_u32.knode.mask = n->mask; #else cls_u32.knode.val = 0; cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; err = tc_setup_cb_add(block, tp, TC_SETUP_CLSU32, &cls_u32, skip_sw, &n->flags, &n->in_hw_count, true); if (err) { u32_remove_hw_knode(tp, n, NULL); return err; } if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; } static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_knode *n; unsigned int h; for (h = 0; h <= ht->divisor; h++) { while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); tp_c->knodes--; tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n, extack); 
idr_remove(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); else u32_destroy_key(n, true); } } } static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode __rcu **hn; struct tc_u_hnode *phn; u32_clear_hnode(tp, ht, extack); hn = &tp_c->hlist; for (phn = rtnl_dereference(*hn); phn; hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); idr_remove(&tp_c->handle_idr, handle2id(ht->handle)); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; } } return -ENOENT; } static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); WARN_ON(root_ht == NULL); if (root_ht && refcount_dec_and_test(&root_ht->refcnt)) u32_destroy_hnode(tp, root_ht, extack); if (refcount_dec_and_test(&tp_c->refcnt)) { struct tc_u_hnode *ht; hlist_del(&tp_c->hnode); while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { u32_clear_hnode(tp, ht, extack); RCU_INIT_POINTER(tp_c->hlist, ht->next); /* u32_destroy_key() will later free ht for us, if it's * still referenced by some knode */ if (refcount_dec_and_test(&ht->refcnt)) kfree_rcu(ht, rcu); } idr_destroy(&tp_c->handle_idr); kfree(tp_c); } tp->data = NULL; } static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; struct tc_u_common *tp_c = tp->data; int ret = 0; if (TC_U32_KEY(ht->handle)) { u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack); ret = u32_delete_key(tp, (struct tc_u_knode *)ht); goto out; } if (ht->is_root) { NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); return -EINVAL; } if (refcount_dec_if_one(&ht->refcnt)) { u32_destroy_hnode(tp, ht, extack); 
} else { NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); return -EBUSY; } out: *last = refcount_read(&tp_c->refcnt) == 1 && tp_c->knodes == 0; return ret; } static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { u32 index = htid | 0x800; u32 max = htid | 0xFFF; if (idr_alloc_u32(&ht->handle_idr, NULL, &index, max, GFP_KERNEL)) { index = htid + 1; if (idr_alloc_u32(&ht->handle_idr, NULL, &index, max, GFP_KERNEL)) index = max; } return index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_CLASSID] = { .type = NLA_U32 }, [TCA_U32_HASH] = { .type = NLA_U32 }, [TCA_U32_LINK] = { .type = NLA_U32 }, [TCA_U32_DIVISOR] = { .type = NLA_U32 }, [TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) }, [TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) }, [TCA_U32_FLAGS] = { .type = NLA_U32 }, }; static void u32_unbind_filter(struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb) { if (tb[TCA_U32_CLASSID]) tcf_unbind_filter(tp, &n->res); } static void u32_bind_filter(struct tcf_proto *tp, struct tc_u_knode *n, unsigned long base, struct nlattr **tb) { if (tb[TCA_U32_CLASSID]) { n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); tcf_bind_filter(tp, &n->res, base); } } static int u32_set_parms(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { int err, ifindex = -1; err = tcf_exts_validate_ex(net, tp, tb, est, &n->exts, flags, fl_flags, extack); if (err < 0) return err; if (tb[TCA_U32_INDEV]) { ifindex = tcf_change_indev(net, tb[TCA_U32_INDEV], extack); if (ifindex < 0) return -EINVAL; } if (tb[TCA_U32_LINK]) { u32 handle = nla_get_u32(tb[TCA_U32_LINK]); struct tc_u_hnode *ht_down = NULL, *ht_old; if (TC_U32_KEY(handle)) { NL_SET_ERR_MSG_MOD(extack, "u32 Link handle must be a hash table"); return -EINVAL; } if (handle) { ht_down = u32_lookup_ht(tp->data, handle); if 
(!ht_down) { NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); return -EINVAL; } if (ht_down->is_root) { NL_SET_ERR_MSG_MOD(extack, "Not linking to root node"); return -EINVAL; } refcount_inc(&ht_down->refcnt); } ht_old = rtnl_dereference(n->ht_down); rcu_assign_pointer(n->ht_down, ht_down); if (ht_old) refcount_dec(&ht_old->refcnt); } if (ifindex >= 0) n->ifindex = ifindex; return 0; } static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, struct tc_u_knode *n) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; struct tc_u_hnode *ht; if (TC_U32_HTID(n->handle) == TC_U32_ROOT) ht = rtnl_dereference(tp->root); else ht = u32_lookup_ht(tp_c, TC_U32_HTID(n->handle)); ins = &ht->ht[TC_U32_HASH(n->handle)]; /* The node must always exist for it to be replaced if this is not the * case then something went very wrong elsewhere. */ for (pins = rtnl_dereference(*ins); ; ins = &pins->next, pins = rtnl_dereference(*ins)) if (pins->handle == n->handle) break; idr_replace(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tc_u32_sel *s = &n->sel; struct tc_u_knode *new; new = kzalloc(struct_size(new, sel.keys, s->nkeys), GFP_KERNEL); if (!new) return NULL; RCU_INIT_POINTER(new->next, n->next); new->handle = n->handle; RCU_INIT_POINTER(new->ht_up, n->ht_up); new->ifindex = n->ifindex; new->fshift = n->fshift; new->flags = n->flags; RCU_INIT_POINTER(new->ht_down, ht); #ifdef CONFIG_CLS_U32_PERF /* Statistics may be incremented by readers during update * so we must keep them in tact. When the node is later destroyed * a special destroy call must be made to not free the pf memory. 
*/ new->pf = n->pf; #endif #ifdef CONFIG_CLS_U32_MARK new->val = n->val; new->mask = n->mask; /* Similarly success statistics must be moved as pointers */ new->pcpu_success = n->pcpu_success; #endif memcpy(&new->sel, s, struct_size(s, keys, s->nkeys)); if (tcf_exts_init(&new->exts, net, TCA_U32_ACT, TCA_U32_POLICE)) { kfree(new); return NULL; } /* bump reference count as long as we hold pointer to structure */ if (ht) refcount_inc(&ht->refcnt); return new; } static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; u32 htid, userflags = 0; size_t sel_size; int err; if (!opt) { if (handle) { NL_SET_ERR_MSG_MOD(extack, "Filter handle requires options"); return -EINVAL; } else { return 0; } } err = nla_parse_nested_deprecated(tb, TCA_U32_MAX, opt, u32_policy, extack); if (err < 0) return err; if (tb[TCA_U32_FLAGS]) { userflags = nla_get_u32(tb[TCA_U32_FLAGS]); if (!tc_flags_valid(userflags)) { NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; } } n = *arg; if (n) { struct tc_u_knode *new; if (TC_U32_KEY(n->handle) == 0) { NL_SET_ERR_MSG_MOD(extack, "Key node id cannot be zero"); return -EINVAL; } if ((n->flags ^ userflags) & ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) { NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; } new = u32_init_knode(net, tp, n); if (!new) return -ENOMEM; err = u32_set_parms(net, tp, new, tb, tca[TCA_RATE], flags, new->flags, extack); if (err) { __u32_destroy_key(new); return err; } u32_bind_filter(tp, new, base, tb); err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { u32_unbind_filter(tp, new, tb); if (tb[TCA_U32_LINK]) { struct tc_u_hnode *ht_old; ht_old = 
rtnl_dereference(n->ht_down); if (ht_old) refcount_inc(&ht_old->refcnt); } __u32_destroy_key(new); return err; } if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, new->flags); u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); tcf_queue_work(&n->rwork, u32_delete_key_work); return 0; } if (tb[TCA_U32_DIVISOR]) { unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); if (!is_power_of_2(divisor)) { NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2"); return -EINVAL; } if (divisor-- > 0x100) { NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); return -EINVAL; } if (TC_U32_KEY(handle)) { NL_SET_ERR_MSG_MOD(extack, "Divisor can only be used on a hash table"); return -EINVAL; } ht = kzalloc(struct_size(ht, ht, divisor + 1), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; if (handle == 0) { handle = gen_new_htid(tp->data, ht); if (handle == 0) { kfree(ht); return -ENOMEM; } } else { err = idr_alloc_u32(&tp_c->handle_idr, ht, &handle, handle, GFP_KERNEL); if (err) { kfree(ht); return err; } } refcount_set(&ht->refcnt, 1); ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; idr_init(&ht->handle_idr); ht->flags = userflags; err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { idr_remove(&tp_c->handle_idr, handle2id(handle)); kfree(ht); return err; } RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = ht; return 0; } if (tb[TCA_U32_HASH]) { htid = nla_get_u32(tb[TCA_U32_HASH]); if (TC_U32_HTID(htid) == TC_U32_ROOT) { ht = rtnl_dereference(tp->root); htid = ht->handle; } else { ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); if (!ht) { NL_SET_ERR_MSG_MOD(extack, "Specified hash table not found"); return -EINVAL; } } } else { ht = rtnl_dereference(tp->root); htid = ht->handle; } if (ht->divisor < TC_U32_HASH(htid)) { NL_SET_ERR_MSG_MOD(extack, "Specified hash table buckets exceed configured value"); return -EINVAL; 
} /* At this point, we need to derive the new handle that will be used to * uniquely map the identity of this table match entry. The * identity of the entry that we need to construct is 32 bits made of: * htid(12b):bucketid(8b):node/entryid(12b) * * At this point _we have the table(ht)_ in which we will insert this * entry. We carry the table's id in variable "htid". * Note that earlier code picked the ht selection either by a) the user * providing the htid specified via TCA_U32_HASH attribute or b) when * no such attribute is passed then the root ht, is default to at ID * 0x[800][00][000]. Rule: the root table has a single bucket with ID 0. * If OTOH the user passed us the htid, they may also pass a bucketid of * choice. 0 is fine. For example a user htid is 0x[600][01][000] it is * indicating hash bucketid of 1. Rule: the entry/node ID _cannot_ be * passed via the htid, so even if it was non-zero it will be ignored. * * We may also have a handle, if the user passed one. The handle also * carries the same addressing of htid(12b):bucketid(8b):node/entryid(12b). * Rule: the bucketid on the handle is ignored even if one was passed; * rather the value on "htid" is always assumed to be the bucketid. */ if (handle) { /* Rule: The htid from handle and tableid from htid must match */ if (TC_U32_HTID(handle) && TC_U32_HTID(handle ^ htid)) { NL_SET_ERR_MSG_MOD(extack, "Handle specified hash table address mismatch"); return -EINVAL; } /* Ok, so far we have a valid htid(12b):bucketid(8b) but we * need to finalize the table entry identification with the last * part - the node/entryid(12b)). Rule: Nodeid _cannot be 0_ for * entries. Rule: nodeid of 0 is reserved only for tables(see * earlier code which processes TC_U32_DIVISOR attribute). * Rule: The nodeid can only be derived from the handle (and not * htid). * Rule: if the handle specified zero for the node id example * 0x60000000, then pick a new nodeid from the pool of IDs * this hash table has been allocating from. 
* If OTOH it is specified (i.e for example the user passed a * handle such as 0x60000123), then we use it generate our final * handle which is used to uniquely identify the match entry. */ if (!TC_U32_NODE(handle)) { handle = gen_new_kid(ht, htid); } else { handle = htid | TC_U32_NODE(handle); err = idr_alloc_u32(&ht->handle_idr, NULL, &handle, handle, GFP_KERNEL); if (err) return err; } } else { /* The user did not give us a handle; lets just generate one * from the table's pool of nodeids. */ handle = gen_new_kid(ht, htid); } if (tb[TCA_U32_SEL] == NULL) { NL_SET_ERR_MSG_MOD(extack, "Selector not specified"); err = -EINVAL; goto erridr; } s = nla_data(tb[TCA_U32_SEL]); sel_size = struct_size(s, keys, s->nkeys); if (nla_len(tb[TCA_U32_SEL]) < sel_size) { err = -EINVAL; goto erridr; } n = kzalloc(struct_size(n, sel.keys, s->nkeys), GFP_KERNEL); if (n == NULL) { err = -ENOBUFS; goto erridr; } #ifdef CONFIG_CLS_U32_PERF n->pf = __alloc_percpu(struct_size(n->pf, kcnts, s->nkeys), __alignof__(struct tc_u32_pcnt)); if (!n->pf) { err = -ENOBUFS; goto errfree; } #endif unsafe_memcpy(&n->sel, s, sel_size, /* A composite flex-array structure destination, * which was correctly sized with struct_size(), * bounds-checked against nla_len(), and allocated * above. */); RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? 
ffs(ntohl(s->hmask)) - 1 : 0; n->flags = userflags; err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) goto errout; #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); if (!n->pcpu_success) { err = -ENOMEM; goto errout; } if (tb[TCA_U32_MARK]) { struct tc_u32_mark *mark; mark = nla_data(tb[TCA_U32_MARK]); n->val = mark->val; n->mask = mark->mask; } #endif err = u32_set_parms(net, tp, n, tb, tca[TCA_RATE], flags, n->flags, extack); u32_bind_filter(tp, n, base, tb); if (err == 0) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; err = u32_replace_hw_knode(tp, n, flags, extack); if (err) goto errunbind; if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, n->flags); ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) if (TC_U32_NODE(handle) < TC_U32_NODE(pins->handle)) break; RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); tp_c->knodes++; *arg = n; return 0; } errunbind: u32_unbind_filter(tp, n, tb); #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); #endif errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF errfree: free_percpu(n->pf); #endif kfree(n); erridr: idr_remove(&ht->handle_idr, handle); return err; } static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; unsigned int h; if (arg->stop) return; for (ht = rtnl_dereference(tp_c->hlist); ht; ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; if (!tc_cls_stats_dump(tp, arg, ht)) return; for (h = 0; h <= ht->divisor; h++) { for (n = rtnl_dereference(ht->ht[h]); n; n = rtnl_dereference(n->next)) { if (!tc_cls_stats_dump(tp, arg, n)) return; } } } } static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) 
{ struct tc_cls_u32_offload cls_u32 = {}; int err; tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack); cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE; cls_u32.hnode.divisor = ht->divisor; cls_u32.hnode.handle = ht->handle; cls_u32.hnode.prio = ht->prio; err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); if (err && add && tc_skip_sw(ht->flags)) return err; return 0; } static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); struct tcf_block *block = tp->chain->block; struct tc_cls_u32_offload cls_u32 = {}; tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); cls_u32.command = add ? TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE; cls_u32.knode.handle = n->handle; if (add) { cls_u32.knode.fshift = n->fshift; #ifdef CONFIG_CLS_U32_MARK cls_u32.knode.val = n->val; cls_u32.knode.mask = n->mask; #else cls_u32.knode.val = 0; cls_u32.knode.mask = 0; #endif cls_u32.knode.sel = &n->sel; cls_u32.knode.res = &n->res; cls_u32.knode.exts = &n->exts; if (n->ht_down) cls_u32.knode.link_handle = ht->handle; } return tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSU32, &cls_u32, cb_priv, &n->flags, &n->in_hw_count); } static int u32_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; unsigned int h; int err; for (ht = rtnl_dereference(tp_c->hlist); ht; ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; /* When adding filters to a new dev, try to offload the * hashtable first. When removing, do the filters before the * hashtable. 
*/ if (add && !tc_skip_hw(ht->flags)) { err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); if (err) return err; } for (h = 0; h <= ht->divisor; h++) { for (n = rtnl_dereference(ht->ht[h]); n; n = rtnl_dereference(n->next)) { if (tc_skip_hw(n->flags)) continue; err = u32_reoffload_knode(tp, n, add, cb, cb_priv, extack); if (err) return err; } } if (!add && !tc_skip_hw(ht->flags)) u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); } return 0; } static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct tc_u_knode *n = fh; tc_cls_bind_class(classid, cl, q, &n->res, base); } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct tc_u_knode *n = fh; struct tc_u_hnode *ht_up, *ht_down; struct nlattr *nest; if (n == NULL) return skb->len; t->tcm_handle = n->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { struct tc_u_hnode *ht = fh; u32 divisor = ht->divisor + 1; if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) goto nla_put_failure; } else { #ifdef CONFIG_CLS_U32_PERF struct tc_u32_pcnt *gpf; int cpu; #endif if (nla_put(skb, TCA_U32_SEL, struct_size(&n->sel, keys, n->sel.nkeys), &n->sel)) goto nla_put_failure; ht_up = rtnl_dereference(n->ht_up); if (ht_up) { u32 htid = n->handle & 0xFFFFF000; if (nla_put_u32(skb, TCA_U32_HASH, htid)) goto nla_put_failure; } if (n->res.classid && nla_put_u32(skb, TCA_U32_CLASSID, n->res.classid)) goto nla_put_failure; ht_down = rtnl_dereference(n->ht_down); if (ht_down && nla_put_u32(skb, TCA_U32_LINK, ht_down->handle)) goto nla_put_failure; if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags)) goto nla_put_failure; #ifdef CONFIG_CLS_U32_MARK if ((n->val || n->mask)) { struct tc_u32_mark mark = {.val = n->val, .mask = n->mask, .success = 0}; int cpum; for_each_possible_cpu(cpum) { __u32 cnt = 
*per_cpu_ptr(n->pcpu_success, cpum); mark.success += cnt; } if (nla_put(skb, TCA_U32_MARK, sizeof(mark), &mark)) goto nla_put_failure; } #endif if (tcf_exts_dump(skb, &n->exts) < 0) goto nla_put_failure; if (n->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, n->ifindex); if (dev && nla_put_string(skb, TCA_U32_INDEV, dev->name)) goto nla_put_failure; } #ifdef CONFIG_CLS_U32_PERF gpf = kzalloc(struct_size(gpf, kcnts, n->sel.nkeys), GFP_KERNEL); if (!gpf) goto nla_put_failure; for_each_possible_cpu(cpu) { int i; struct tc_u32_pcnt *pf = per_cpu_ptr(n->pf, cpu); gpf->rcnt += pf->rcnt; gpf->rhit += pf->rhit; for (i = 0; i < n->sel.nkeys; i++) gpf->kcnts[i] += pf->kcnts[i]; } if (nla_put_64bit(skb, TCA_U32_PCNT, struct_size(gpf, kcnts, n->sel.nkeys), gpf, TCA_U32_PAD)) { kfree(gpf); goto nla_put_failure; } kfree(gpf); #endif } nla_nest_end(skb, nest); if (TC_U32_KEY(n->handle)) if (tcf_exts_dump_stats(skb, &n->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct tcf_proto_ops cls_u32_ops __read_mostly = { .kind = "u32", .classify = u32_classify, .init = u32_init, .destroy = u32_destroy, .get = u32_get, .change = u32_change, .delete = u32_delete, .walk = u32_walk, .reoffload = u32_reoffload, .dump = u32_dump, .bind_class = u32_bind_class, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("u32"); static int __init init_u32(void) { int i, ret; pr_info("u32 classifier\n"); #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); #endif pr_info(" input device check on\n"); #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!tc_u_common_hash) return -ENOMEM; for (i = 0; i < U32_HASH_SIZE; i++) INIT_HLIST_HEAD(&tc_u_common_hash[i]); ret = register_tcf_proto_ops(&cls_u32_ops); if (ret) kvfree(tc_u_common_hash); return ret; } static void __exit exit_u32(void) { 
unregister_tcf_proto_ops(&cls_u32_ops); kvfree(tc_u_common_hash); } module_init(init_u32) module_exit(exit_u32) MODULE_DESCRIPTION("Universal 32bit based TC Classifier"); MODULE_LICENSE("GPL");
40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_timestamp.h> static bool nf_ct_tstamp __read_mostly; module_param_named(tstamp, nf_ct_tstamp, bool, 0644); MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); void nf_conntrack_tstamp_pernet_init(struct net *net) { net->ct.sysctl_tstamp = nf_ct_tstamp; }
1 1 1 1 1 1 1 4 1 1 1 2 1 4 2 4 4 4 4 1 1 3 1 4 4 2 1 1 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 // SPDX-License-Identifier: GPL-2.0-only /* * Overlayfs NFS export support. * * Amir Goldstein <amir73il@gmail.com> * * Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved. 
*/ #include <linux/fs.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/ratelimit.h> #include "overlayfs.h" static int ovl_encode_maybe_copy_up(struct dentry *dentry) { int err; if (ovl_dentry_upper(dentry)) return 0; err = ovl_copy_up(dentry); if (err) { pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); } return err; } /* * Before encoding a non-upper directory file handle from real layer N, we need * to check if it will be possible to reconnect an overlay dentry from the real * lower decoded dentry. This is done by following the overlay ancestry up to a * "layer N connected" ancestor and verifying that all parents along the way are * "layer N connectable". If an ancestor that is NOT "layer N connectable" is * found, we need to copy up an ancestor, which is "layer N connectable", thus * making that ancestor "layer N connected". For example: * * layer 1: /a * layer 2: /a/b/c * * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is * copied up and renamed, upper dir /a will be indexed by lower dir /a from * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay * dentry from the connected lower dentry /a/b/c. * * To avoid this problem on decode time, we need to copy up an ancestor of * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" * and when the time comes to decode the file handle from lower dentry /a/b/c, * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding * a connected overlay dentry will be accomplished. * * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an * entry /a in the lower layers above layer N and find the indexed dir /a from * layer 1. 
If that improvement is made, then the check for "layer N connected" * will need to verify there are no redirects in lower layers above N. In the * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": * * layer 1: /A (redirect = /a) * layer 2: /a/b/c */ /* Return the lowest layer for encoding a connectable file handle */ static int ovl_connectable_layer(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); /* We can get overlay root from root of any layer */ if (dentry == dentry->d_sb->s_root) return ovl_numlower(oe); /* * If it's an unindexed merge dir, then it's not connectable with any * lower layer */ if (ovl_dentry_upper(dentry) && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) return 0; /* We can get upper/overlay path from indexed/lower dentry */ return ovl_lowerstack(oe)->layer->idx; } /* * @dentry is "connected" if all ancestors up to root or a "connected" ancestor * have the same uppermost lower layer as the origin's layer. We may need to * copy up a "connectable" ancestor to make it "connected". A "connected" dentry * cannot become non "connected", so cache positive result in dentry flags. * * Return the connected origin layer or < 0 on error. */ static int ovl_connect_layer(struct dentry *dentry) { struct dentry *next, *parent = NULL; struct ovl_entry *oe = OVL_E(dentry); int origin_layer; int err = 0; if (WARN_ON(dentry == dentry->d_sb->s_root) || WARN_ON(!ovl_dentry_lower(dentry))) return -EIO; origin_layer = ovl_lowerstack(oe)->layer->idx; if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) return origin_layer; /* Find the topmost origin layer connectable ancestor of @dentry */ next = dget(dentry); for (;;) { parent = dget_parent(next); if (WARN_ON(parent == next)) { err = -EIO; break; } /* * If @parent is not origin layer connectable, then copy up * @next which is origin layer connectable and we are done. 
*/ if (ovl_connectable_layer(parent) < origin_layer) { err = ovl_encode_maybe_copy_up(next); break; } /* If @parent is connected or indexed we are done */ if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || ovl_test_flag(OVL_INDEX, d_inode(parent))) break; dput(next); next = parent; } dput(parent); dput(next); if (!err) ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); return err ?: origin_layer; } /* * We only need to encode origin if there is a chance that the same object was * encoded pre copy up and then we need to stay consistent with the same * encoding also after copy up. If non-pure upper is not indexed, then it was * copied up before NFS export was enabled. In that case we don't need to worry * about staying consistent with pre copy up encoding and we encode an upper * file handle. Overlay root dentry is a private case of non-indexed upper. * * The following table summarizes the different file handle encodings used for * different overlay object types: * * Object type | Encoding * -------------------------------- * Pure upper | U * Non-indexed upper | U * Indexed upper | L (*) * Non-upper | L (*) * * U = upper file handle * L = lower file handle * * (*) Decoding a connected overlay dir from real lower dentry is not always * possible when there are redirects in lower layers and non-indexed merge dirs. * To mitigate those case, we may copy up the lower dir ancestor before encode * of a decodable file handle for non-upper dir. * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ static int ovl_check_encode_origin(struct inode *inode) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); bool decodable = ofs->config.nfs_export; struct dentry *dentry; int err; /* No upper layer? 
*/ if (!ovl_upper_mnt(ofs)) return 1; /* Lower file handle for non-upper non-decodable */ if (!ovl_inode_upper(inode) && !decodable) return 1; /* Upper file handle for pure upper */ if (!ovl_inode_lower(inode)) return 0; /* * Root is never indexed, so if there's an upper layer, encode upper for * root. */ if (inode == d_inode(inode->i_sb->s_root)) return 0; /* * Upper decodable file handle for non-indexed upper. */ if (ovl_inode_upper(inode) && decodable && !ovl_test_flag(OVL_INDEX, inode)) return 0; /* * Decoding a merge dir, whose origin's ancestor is under a redirected * lower dir or under a non-indexed upper is not always possible. * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ if (!decodable || !S_ISDIR(inode->i_mode)) return 1; dentry = d_find_any_alias(inode); if (!dentry) return -ENOENT; err = ovl_connect_layer(dentry); dput(dentry); if (err < 0) return err; /* Lower file handle for indexed and non-upper dir/non-dir */ return 1; } static int ovl_dentry_to_fid(struct ovl_fs *ofs, struct inode *inode, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; int len; /* * Check if we should encode a lower or upper file handle and maybe * copy up an ancestor to make lower file handle connectable. */ err = enc_lower = ovl_check_encode_origin(inode); if (enc_lower < 0) goto fail; /* Encode an upper or lower file handle */ fh = ovl_encode_real_fh(ofs, enc_lower ? 
ovl_inode_lower(inode) : ovl_inode_upper(inode), !enc_lower); if (IS_ERR(fh)) return PTR_ERR(fh); len = OVL_FH_LEN(fh); if (len <= buflen) memcpy(fid, fh, len); err = len; out: kfree(fh); return err; fail: pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", inode->i_ino, err); goto out; } static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); int bytes, buflen = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) return FILEID_INVALID; bytes = ovl_dentry_to_fid(ofs, inode, fid, buflen); if (bytes <= 0) return FILEID_INVALID; *max_len = bytes >> 2; if (bytes > buflen) return FILEID_INVALID; return OVL_FILEID_V1; } /* * Find or instantiate an overlay dentry from real dentries and index. */ static struct dentry *ovl_obtain_alias(struct super_block *sb, struct dentry *upper_alias, struct ovl_path *lowerpath, struct dentry *index) { struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; struct dentry *upper = upper_alias ?: index; struct inode *inode = NULL; struct ovl_entry *oe; struct ovl_inode_params oip = { .index = index, }; /* We get overlay directory dentries with ovl_lookup_real() */ if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); oe = ovl_alloc_entry(!!lower); if (!oe) return ERR_PTR(-ENOMEM); oip.upperdentry = dget(upper); if (lower) { ovl_lowerstack(oe)->dentry = dget(lower); ovl_lowerstack(oe)->layer = lowerpath->layer; } oip.oe = oe; inode = ovl_get_inode(sb, &oip); if (IS_ERR(inode)) { ovl_free_entry(oe); dput(upper); return ERR_CAST(inode); } if (upper) ovl_set_flag(OVL_UPPERDATA, inode); return d_obtain_alias(inode); } /* Get the upper or lower dentry in stack whose on layer @idx */ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); int i; if (!idx) return ovl_dentry_upper(dentry); for (i = 0; i < ovl_numlower(oe); 
i++) { if (lowerstack[i].layer->idx == idx) return lowerstack[i].dentry; } return NULL; } /* * Lookup a child overlay dentry to get a connected overlay dentry whose real * dentry is @real. If @real is on upper layer, we lookup a child overlay * dentry with the same name as the real dentry. Otherwise, we need to consult * index for lookup. */ static struct dentry *ovl_lookup_real_one(struct dentry *connected, struct dentry *real, const struct ovl_layer *layer) { struct inode *dir = d_inode(connected); struct dentry *this, *parent = NULL; struct name_snapshot name; int err; /* * Lookup child overlay dentry by real name. The dir mutex protects us * from racing with overlay rename. If the overlay dentry that is above * real has already been moved to a parent that is not under the * connected overlay dir, we return -ECHILD and restart the lookup of * connected real path from the top. */ inode_lock_nested(dir, I_MUTEX_PARENT); err = -ECHILD; parent = dget_parent(real); if (ovl_dentry_real_at(connected, layer->idx) != parent) goto fail; /* * We also need to take a snapshot of real dentry name to protect us * from racing with underlying layer rename. In this case, we don't * care about returning ESTALE, only from dereferencing a free name * pointer because we hold no lock on the real dentry. */ take_dentry_name_snapshot(&name, real); /* * No idmap handling here: it's an internal lookup. Could skip * permission checking altogether, but for now just use non-idmap * transformed ids. 
*/ this = lookup_one_len(name.name.name, connected, name.name.len); release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { goto fail; } else if (!this || !this->d_inode) { dput(this); err = -ENOENT; goto fail; } else if (ovl_dentry_real_at(this, layer->idx) != real) { dput(this); err = -ESTALE; goto fail; } out: dput(parent); inode_unlock(dir); return this; fail: pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); this = ERR_PTR(err); goto out; } static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer); /* * Lookup an indexed or hashed overlay dentry by real inode. */ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; /* * Decoding upper dir from index is expensive, so first try to lookup * overlay dentry in inode/dcache. */ inode = ovl_lookup_inode(sb, real, !layer->idx); if (IS_ERR(inode)) return ERR_CAST(inode); if (inode) { this = d_find_any_alias(inode); iput(inode); } /* * For decoded lower dir file handle, lookup index by origin to check * if lower dir was copied up and and/or removed. */ if (!this && layer->idx && ovl_indexdir(sb) && !WARN_ON(!d_is_dir(real))) { index = ovl_lookup_index(ofs, NULL, real, false); if (IS_ERR(index)) return index; } /* Get connected upper overlay dir from index */ if (index) { struct dentry *upper = ovl_index_upper(ofs, index, true); dput(index); if (IS_ERR_OR_NULL(upper)) return upper; /* * ovl_lookup_real() in lower layer may call recursively once to * ovl_lookup_real() in upper layer. The first level call walks * back lower parents to the topmost indexed parent. The second * recursive call walks back from indexed upper to the topmost * connected/hashed upper parent (or up to root). 
*/ this = ovl_lookup_real(sb, upper, &ofs->layers[0]); dput(upper); } if (IS_ERR_OR_NULL(this)) return this; if (ovl_dentry_real_at(this, layer->idx) != real) { dput(this); this = ERR_PTR(-EIO); } return this; } /* * Lookup an indexed or hashed overlay dentry, whose real dentry is an * ancestor of @real. */ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { struct dentry *next, *parent = NULL; struct dentry *ancestor = ERR_PTR(-EIO); if (real == layer->mnt->mnt_root) return dget(sb->s_root); /* Find the topmost indexed or hashed ancestor */ next = dget(real); for (;;) { parent = dget_parent(next); /* * Lookup a matching overlay dentry in inode/dentry * cache or in index by real inode. */ ancestor = ovl_lookup_real_inode(sb, next, layer); if (ancestor) break; if (parent == layer->mnt->mnt_root) { ancestor = dget(sb->s_root); break; } /* * If @real has been moved out of the layer root directory, * we will eventully hit the real fs root. This cannot happen * by legit overlay rename, so we return error in that case. */ if (parent == next) { ancestor = ERR_PTR(-EXDEV); break; } dput(next); next = parent; } dput(parent); dput(next); return ancestor; } /* * Lookup a connected overlay dentry whose real dentry is @real. * If @real is on upper layer, we lookup a child overlay dentry with the same * path the real dentry. Otherwise, we need to consult index for lookup. 
*/ static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { struct dentry *connected; int err = 0; connected = ovl_lookup_real_ancestor(sb, real, layer); if (IS_ERR(connected)) return connected; while (!err) { struct dentry *next, *this; struct dentry *parent = NULL; struct dentry *real_connected = ovl_dentry_real_at(connected, layer->idx); if (real_connected == real) break; /* Find the topmost dentry not yet connected */ next = dget(real); for (;;) { parent = dget_parent(next); if (parent == real_connected) break; /* * If real has been moved out of 'real_connected', * we will not find 'real_connected' and hit the layer * root. In that case, we need to restart connecting. * This game can go on forever in the worst case. We * may want to consider taking s_vfs_rename_mutex if * this happens more than once. */ if (parent == layer->mnt->mnt_root) { dput(connected); connected = dget(sb->s_root); break; } /* * If real file has been moved out of the layer root * directory, we will eventully hit the real fs root. * This cannot happen by legit overlay rename, so we * return error in that case. */ if (parent == next) { err = -EXDEV; break; } dput(next); next = parent; } if (!err) { this = ovl_lookup_real_one(connected, next, layer); if (IS_ERR(this)) err = PTR_ERR(this); /* * Lookup of child in overlay can fail when racing with * overlay rename of child away from 'connected' parent. * In this case, we need to restart the lookup from the * top, because we cannot trust that 'real_connected' is * still an ancestor of 'real'. There is a good chance * that the renamed overlay ancestor is now in cache, so * ovl_lookup_real_ancestor() will find it and we can * continue to connect exactly from where lookup failed. 
*/ if (err == -ECHILD) { this = ovl_lookup_real_ancestor(sb, real, layer); err = PTR_ERR_OR_ZERO(this); } if (!err) { dput(connected); connected = this; } } dput(parent); dput(next); } if (err) goto fail; return connected; fail: pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); dput(connected); return ERR_PTR(err); } /* * Get an overlay dentry from upper/lower real dentries and index. */ static struct dentry *ovl_get_dentry(struct super_block *sb, struct dentry *upper, struct ovl_path *lowerpath, struct dentry *index) { struct ovl_fs *ofs = OVL_FS(sb); const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); /* * Obtain a disconnected overlay dentry from a non-dir real dentry * and index. */ if (!d_is_dir(real)) return ovl_obtain_alias(sb, upper, lowerpath, index); /* Removed empty directory? */ if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real)) return ERR_PTR(-ENOENT); /* * If real dentry is connected and hashed, get a connected overlay * dentry whose real dentry is @real. 
*/ return ovl_lookup_real(sb, real, layer); } static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *dentry; struct dentry *upper; if (!ovl_upper_mnt(ofs)) return ERR_PTR(-EACCES); upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true); if (IS_ERR_OR_NULL(upper)) return upper; dentry = ovl_get_dentry(sb, upper, NULL, NULL); dput(upper); return dentry; } static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *dentry = NULL; struct dentry *index = NULL; struct inode *inode; int err; /* First lookup overlay inode in inode cache by origin fh */ err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack); if (err) return ERR_PTR(err); if (!d_is_dir(origin.dentry) || !(origin.dentry->d_flags & DCACHE_DISCONNECTED)) { inode = ovl_lookup_inode(sb, origin.dentry, false); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_err; if (inode) { dentry = d_find_any_alias(inode); iput(inode); if (dentry) goto out; } } /* Then lookup indexed upper/whiteout by origin fh */ if (ovl_indexdir(sb)) { index = ovl_get_index_fh(ofs, fh); err = PTR_ERR(index); if (IS_ERR(index)) { index = NULL; goto out_err; } } /* Then try to get a connected upper dir by index */ if (index && d_is_dir(index)) { struct dentry *upper = ovl_index_upper(ofs, index, true); err = PTR_ERR(upper); if (IS_ERR_OR_NULL(upper)) goto out_err; dentry = ovl_get_dentry(sb, upper, NULL, NULL); dput(upper); goto out; } /* Find origin.dentry again with ovl_acceptable() layer check */ if (d_is_dir(origin.dentry)) { dput(origin.dentry); origin.dentry = NULL; err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); if (err) goto out_err; } if (index) { err = ovl_verify_origin(ofs, index, origin.dentry, false); if (err) goto out_err; } /* Get a connected non-upper dir or disconnected non-dir */ dentry = 
ovl_get_dentry(sb, NULL, &origin, index); out: dput(origin.dentry); dput(index); return dentry; out_err: dentry = ERR_PTR(err); goto out; } static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) { struct ovl_fh *fh; /* If on-wire inner fid is aligned - nothing to do */ if (fh_type == OVL_FILEID_V1) return (struct ovl_fh *)fid; if (fh_type != OVL_FILEID_V0) return ERR_PTR(-EINVAL); if (buflen <= OVL_FH_WIRE_OFFSET) return ERR_PTR(-EINVAL); fh = kzalloc(buflen, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); /* Copy unaligned inner fh into aligned buffer */ memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET); return fh; } static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; fh = ovl_fid_to_fh(fid, len, fh_type); err = PTR_ERR(fh); if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); err = PTR_ERR(dentry); if (IS_ERR(dentry) && err != -ESTALE) goto out_err; out: /* We may have needed to re-align OVL_FILEID_V0 */ if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) kfree(fh); return dentry; out_err: pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", fh_len, fh_type, flags, err); dentry = ERR_PTR(err); goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); return ERR_PTR(-EACCES); } static int ovl_get_name(struct dentry *parent, char *name, struct dentry *child) { /* * ovl_fh_to_dentry() returns connected dir overlay dentries and * ovl_fh_to_parent() is not implemented, so we should not get here. 
*/ WARN_ON_ONCE(1); return -EIO; } static struct dentry *ovl_get_parent(struct dentry *dentry) { /* * ovl_fh_to_dentry() returns connected dir overlay dentries, so we * should not get here. */ WARN_ON_ONCE(1); return ERR_PTR(-EIO); } const struct export_operations ovl_export_operations = { .encode_fh = ovl_encode_fh, .fh_to_dentry = ovl_fh_to_dentry, .fh_to_parent = ovl_fh_to_parent, .get_name = ovl_get_name, .get_parent = ovl_get_parent, }; /* encode_fh() encodes non-decodable file handles with nfs_export=off */ const struct export_operations ovl_export_fid_operations = { .encode_fh = ovl_encode_fh, };
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. 
* * Kernel side components to support tools/testing/selftests/iommu */ #include <linux/anon_inodes.h> #include <linux/debugfs.h> #include <linux/fault-inject.h> #include <linux/file.h> #include <linux/iommu.h> #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/xarray.h> #include <uapi/linux/iommufd.h> #include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" #include "iommufd_test.h" static DECLARE_FAULT_ATTR(fail_iommufd); static struct dentry *dbgfs_root; static struct platform_device *selftest_iommu_dev; static const struct iommu_ops mock_ops; static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; struct mock_bus_type { struct bus_type bus; struct notifier_block nb; }; static struct mock_bus_type iommufd_mock_bus_type = { .bus = { .name = "iommufd_mock", }, }; static DEFINE_IDA(mock_dev_ida); enum { MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, /* * Like a real page table alignment requires the low bits of the address * to be zero. xarray also requires the high bit to be zero, so we store * the pfns shifted. The upper bits are used for metadata. */ MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, _MOCK_PFN_START = MOCK_PFN_MASK + 1, MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, }; /* * Syzkaller has trouble randomizing the correct iova to use since it is linked * to the map ioctl's output, and it has no ide about that. So, simplify things. * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset * value. This has a much smaller randomization space and syzkaller can hit it. 
*/ static unsigned long __iommufd_test_syz_conv_iova(struct io_pagetable *iopt, u64 *iova) { struct syz_layout { __u32 nth_area; __u32 offset; }; struct syz_layout *syz = (void *)iova; unsigned int nth = syz->nth_area; struct iopt_area *area; down_read(&iopt->iova_rwsem); for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; area = iopt_area_iter_next(area, 0, ULONG_MAX)) { if (nth == 0) { up_read(&iopt->iova_rwsem); return iopt_area_iova(area) + syz->offset; } nth--; } up_read(&iopt->iova_rwsem); return 0; } static unsigned long iommufd_test_syz_conv_iova(struct iommufd_access *access, u64 *iova) { unsigned long ret; mutex_lock(&access->ioas_lock); if (!access->ioas) { mutex_unlock(&access->ioas_lock); return 0; } ret = __iommufd_test_syz_conv_iova(&access->ioas->iopt, iova); mutex_unlock(&access->ioas_lock); return ret; } void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags) { struct iommufd_ioas *ioas; if (!(*flags & MOCK_FLAGS_ACCESS_SYZ)) return; *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ; ioas = iommufd_get_ioas(ucmd->ictx, ioas_id); if (IS_ERR(ioas)) return; *iova = __iommufd_test_syz_conv_iova(&ioas->iopt, iova); iommufd_put_object(ucmd->ictx, &ioas->obj); } struct mock_iommu_domain { unsigned long flags; struct iommu_domain domain; struct xarray pfns; }; static inline struct mock_iommu_domain * to_mock_domain(struct iommu_domain *domain) { return container_of(domain, struct mock_iommu_domain, domain); } struct mock_iommu_domain_nested { struct iommu_domain domain; struct mock_viommu *mock_viommu; struct mock_iommu_domain *parent; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; static inline struct mock_iommu_domain_nested * to_mock_nested(struct iommu_domain *domain) { return container_of(domain, struct mock_iommu_domain_nested, domain); } struct mock_viommu { struct iommufd_viommu core; struct mock_iommu_domain *s2_parent; }; static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) { 
return container_of(viommu, struct mock_viommu, core);
}

/* Discriminator for the union inside struct selftest_obj */
enum selftest_obj_type {
	TYPE_IDEV,
};

/* A fake device living on the iommufd mock bus */
struct mock_dev {
	struct device dev;
	/* MOCK_FLAGS_DEVICE_* bits supplied at creation time */
	unsigned long flags;
	/* ID taken from mock_dev_ida, also used in the device name */
	int id;
	/* Fake device cache entries that vIOMMU invalidation zeroes out */
	u32 cache[MOCK_DEV_CACHE_NUM];
};

/* Get the mock device that embeds @dev */
static inline struct mock_dev *to_mock_dev(struct device *dev)
{
	return container_of(dev, struct mock_dev, dev);
}

/* iommufd object wrapping the selftest state (currently only a mock device) */
struct selftest_obj {
	struct iommufd_object obj;
	enum selftest_obj_type type;

	union {
		/* Valid when type == TYPE_IDEV */
		struct {
			struct iommufd_device *idev;
			struct iommufd_ctx *ictx;
			struct mock_dev *mock_dev;
		} idev;
	};
};

/* Get the selftest object that embeds @obj */
static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj)
{
	return container_of(obj, struct selftest_obj, obj);
}

/*
 * Attach callback that changes no state. It only refuses (-EINVAL) to attach a
 * domain that has dirty_ops to a device flagged MOCK_FLAGS_DEVICE_NO_DIRTY.
 */
static int mock_domain_nop_attach(struct iommu_domain *domain,
				  struct device *dev)
{
	struct mock_dev *mdev = to_mock_dev(dev);

	if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
		return -EINVAL;

	return 0;
}

static const struct iommu_domain_ops mock_blocking_ops = {
	.attach_dev = mock_domain_nop_attach,
};

/* The single, statically allocated blocking domain of the mock driver */
static struct iommu_domain mock_blocking_domain = {
	.type = IOMMU_DOMAIN_BLOCKED,
	.ops = &mock_blocking_ops,
};

/*
 * Allocate and return a struct iommu_test_hw_info carrying the selftest
 * register value; its size and type are reported through @length and @type.
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
{
	struct iommu_test_hw_info *info;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);

	info->test_reg = IOMMU_HW_INFO_SELFTEST_REGVAL;
	*length = sizeof(*info);
	*type = IOMMU_HW_INFO_TYPE_SELFTEST;

	return info;
}

/*
 * Turn dirty tracking on or off by updating MOCK_DIRTY_TRACK in the mock
 * domain flags. Enabling fails with -EINVAL if the domain has no dirty_ops.
 */
static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
					  bool enable)
{
	struct mock_iommu_domain *mock = to_mock_domain(domain);
	unsigned long flags = mock->flags;

	if (enable && !domain->dirty_ops)
		return -EINVAL;

	/* No change? */
	if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK)))
		return 0;

	flags = (enable ?
flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK);

	mock->flags = flags;

	return 0;
}

/*
 * Report whether any mock page in [iova, iova + page_size) carries
 * MOCK_PFN_DIRTY_IOVA. Unless IOMMU_DIRTY_NO_CLEAR is set in @flags, the dirty
 * bit of every dirty entry found is cleared in the xarray.
 */
static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock,
				      unsigned long iova, size_t page_size,
				      unsigned long flags)
{
	unsigned long cur, end = iova + page_size - 1;
	bool dirty = false;
	void *ent, *old;

	for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) {
		ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE);
		if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
			continue;

		dirty = true;
		/* Clear dirty */
		if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
			unsigned long val;

			val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
			old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE,
				       xa_mk_value(val), GFP_KERNEL);
			/* The entry is expected to be unchanged since xa_load() */
			WARN_ON_ONCE(ent != old);
		}
	}

	return dirty;
}

/*
 * Walk [iova, iova + size) and record every dirty page (or huge page) in
 * @dirty, clearing the dirty state as directed by @flags. Returns -EINVAL if
 * dirty tracking is off while @dirty has a bitmap to fill.
 */
static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
					    unsigned long iova, size_t size,
					    unsigned long flags,
					    struct iommu_dirty_bitmap *dirty)
{
	struct mock_iommu_domain *mock = to_mock_domain(domain);
	unsigned long end = iova + size;
	void *ent;

	if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
		return -EINVAL;

	do {
		unsigned long pgsize = MOCK_IO_PAGE_SIZE;
		unsigned long head;

		ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
		if (!ent) {
			/* Unmapped: step over one mock page */
			iova += pgsize;
			continue;
		}

		if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA)
			pgsize = MOCK_HUGE_PAGE_SIZE;
		/* Start of the (possibly huge) page containing iova */
		head = iova & ~(pgsize - 1);

		/* Clear dirty */
		if (mock_test_and_clear_dirty(mock, head, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

static const struct iommu_dirty_ops dirty_ops = {
	.set_dirty_tracking = mock_domain_set_dirty_tracking,
	.read_and_clear_dirty = mock_domain_read_and_clear_dirty,
};

/*
 * Common allocator for mock nested domains. @user_data must be of type
 * IOMMU_HWPT_DATA_SELFTEST, otherwise ERR_PTR(-EOPNOTSUPP) is returned.
 */
static struct mock_iommu_domain_nested *
__mock_domain_alloc_nested(const struct iommu_user_data *user_data)
{
	struct mock_iommu_domain_nested *mock_nested;
	struct iommu_hwpt_selftest user_cfg;
	int rc, i;

	if (user_data->type != IOMMU_HWPT_DATA_SELFTEST)
		return ERR_PTR(-EOPNOTSUPP);

	rc =
iommu_copy_struct_from_user(&user_cfg, user_data,
					 IOMMU_HWPT_DATA_SELFTEST, iotlb);
	if (rc)
		return ERR_PTR(rc);

	mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL);
	if (!mock_nested)
		return ERR_PTR(-ENOMEM);
	mock_nested->domain.ops = &domain_nested_ops;
	mock_nested->domain.type = IOMMU_DOMAIN_NESTED;
	/* Every fake IOTLB entry starts out as the user supplied value */
	for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++)
		mock_nested->iotlb[i] = user_cfg.iotlb;
	return mock_nested;
}

/*
 * Allocate a nested domain on top of @parent. No @flags are supported
 * (-EOPNOTSUPP) and @parent must be a mock paging domain (-EINVAL).
 */
static struct iommu_domain *
mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent,
			 u32 flags, const struct iommu_user_data *user_data)
{
	struct mock_iommu_domain_nested *mock_nested;
	struct mock_iommu_domain *mock_parent;

	if (flags)
		return ERR_PTR(-EOPNOTSUPP);
	/* Paging domains of this driver are recognised by their ops pointer */
	if (!parent || parent->ops != mock_ops.default_domain_ops)
		return ERR_PTR(-EINVAL);

	mock_parent = to_mock_domain(parent);
	if (!mock_parent)
		return ERR_PTR(-EINVAL);

	mock_nested = __mock_domain_alloc_nested(user_data);
	if (IS_ERR(mock_nested))
		return ERR_CAST(mock_nested);
	mock_nested->parent = mock_parent;
	return &mock_nested->domain;
}

/*
 * Allocate a mock paging domain. Only IOMMU_HWPT_ALLOC_DIRTY_TRACKING and
 * IOMMU_HWPT_ALLOC_NEST_PARENT are accepted in @flags, and @user_data must be
 * NULL. Dirty tracking is refused for devices flagged
 * MOCK_FLAGS_DEVICE_NO_DIRTY. All rejections return ERR_PTR(-EOPNOTSUPP).
 */
static struct iommu_domain *
mock_domain_alloc_paging_flags(struct device *dev, u32 flags,
			       const struct iommu_user_data *user_data)
{
	bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
	const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
				 IOMMU_HWPT_ALLOC_NEST_PARENT;
	struct mock_dev *mdev = to_mock_dev(dev);
	bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
	struct mock_iommu_domain *mock;

	if (user_data)
		return ERR_PTR(-EOPNOTSUPP);
	if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops))
		return ERR_PTR(-EOPNOTSUPP);

	mock = kzalloc(sizeof(*mock), GFP_KERNEL);
	if (!mock)
		return ERR_PTR(-ENOMEM);
	mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
	mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
	mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
	/*
	 * NOTE(review): mdev->flags was already read above, so the "dev &&"
	 * test cannot protect against a NULL dev here - confirm whether a
	 * NULL dev is possible at all.
	 */
	if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA)
		mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE;
	mock->domain.ops = mock_ops.default_domain_ops;
mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
	xa_init(&mock->pfns);

	if (has_dirty_flag)
		mock->domain.dirty_ops = &dirty_ops;
	return &mock->domain;
}

/* Free a mock paging domain; warns if any page is still mapped */
static void mock_domain_free(struct iommu_domain *domain)
{
	struct mock_iommu_domain *mock = to_mock_domain(domain);

	WARN_ON(!xa_empty(&mock->pfns));
	kfree(mock);
}

/*
 * Record @pgcount pages of @pgsize bytes starting at @iova/@paddr in the pfns
 * xarray, one entry per MOCK_IO_PAGE_SIZE. Each entry stores
 * paddr / MOCK_IO_PAGE_SIZE plus flag bits: the first entry of the call gets
 * MOCK_PFN_START_IOVA, the last one MOCK_PFN_LAST_IOVA, and the first and last
 * entries also get MOCK_PFN_HUGE_IOVA when @pgsize is not MOCK_IO_PAGE_SIZE.
 * *@mapped is advanced for every entry stored.
 *
 * Returns 0 on success, -ENOENT on an injected failure, or the xarray error,
 * in which case the entries stored by this call are erased again.
 */
static int mock_domain_map_pages(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t paddr,
				 size_t pgsize, size_t pgcount, int prot,
				 gfp_t gfp, size_t *mapped)
{
	struct mock_iommu_domain *mock = to_mock_domain(domain);
	unsigned long flags = MOCK_PFN_START_IOVA;
	unsigned long start_iova = iova;

	/*
	 * xarray does not reliably work with fault injection because it does a
	 * retry allocation, so put our own failure point.
	 */
	if (iommufd_should_fail())
		return -ENOENT;

	WARN_ON(iova % MOCK_IO_PAGE_SIZE);
	WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);
	for (; pgcount; pgcount--) {
		size_t cur;

		for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
			void *old;

			/* Final mock page of the final page of this call */
			if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
				flags = MOCK_PFN_LAST_IOVA;
			if (pgsize != MOCK_IO_PAGE_SIZE) {
				flags |= MOCK_PFN_HUGE_IOVA;
			}
			old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE,
				       xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) |
						   flags),
				       gfp);
			if (xa_is_err(old)) {
				/* Roll back everything stored so far */
				for (; start_iova != iova;
				     start_iova += MOCK_IO_PAGE_SIZE)
					xa_erase(&mock->pfns,
						 start_iova /
							 MOCK_IO_PAGE_SIZE);
				return xa_err(old);
			}
			/* Mapping over an existing entry is unexpected */
			WARN_ON(old);
			iova += MOCK_IO_PAGE_SIZE;
			paddr += MOCK_IO_PAGE_SIZE;
			*mapped += MOCK_IO_PAGE_SIZE;
			flags = 0;
		}
	}
	return 0;
}

/*
 * Erase @pgcount pages of @pgsize bytes starting at @iova from the pfns
 * xarray and return the number of bytes walked.
 */
static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
				      unsigned long iova, size_t pgsize,
				      size_t pgcount,
				      struct iommu_iotlb_gather *iotlb_gather)
{
	struct mock_iommu_domain *mock = to_mock_domain(domain);
	bool first = true;
	size_t ret = 0;
	void *ent;

	WARN_ON(iova % MOCK_IO_PAGE_SIZE);
	WARN_ON(pgsize % MOCK_IO_PAGE_SIZE);

	for (; pgcount; pgcount--) {
		size_t cur;

		for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
			ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
/*
 * iommufd generates unmaps that must be a strict
 * superset of the maps performed. So every
 * starting/ending IOVA should have been an iova passed
 * to map.
 *
 * This simple logic doesn't work when the HUGE_PAGE is
 * turned on since the core code will automatically
 * switch between the two page sizes creating a break in
 * the unmap calls. The break can land in the middle of
 * contiguous IOVA.
 */
			if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) {
				/* First entry of the call must start a mapping */
				if (first) {
					WARN_ON(ent && !(xa_to_value(ent) &
							 MOCK_PFN_START_IOVA));
					first = false;
				}
				/* Last entry of the call must end a mapping */
				if (pgcount == 1 &&
				    cur + MOCK_IO_PAGE_SIZE == pgsize)
					WARN_ON(ent && !(xa_to_value(ent) &
							 MOCK_PFN_LAST_IOVA));
			}

			iova += MOCK_IO_PAGE_SIZE;
			ret += MOCK_IO_PAGE_SIZE;
		}
	}
	return ret;
}

/*
 * Look up the physical address recorded for @iova. Warns if @iova is not
 * aligned to MOCK_IO_PAGE_SIZE or has no entry in the pfns xarray.
 */
static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct mock_iommu_domain *mock = to_mock_domain(domain);
	void *ent;

	WARN_ON(iova % MOCK_IO_PAGE_SIZE);
	ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
	WARN_ON(!ent);
	/* Strip the flag bits, then scale the stored pfn back to an address */
	return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE;
}

/*
 * Capability query: cache coherency is always reported, dirty tracking only
 * for devices not flagged MOCK_FLAGS_DEVICE_NO_DIRTY, anything else is false.
 */
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
	struct mock_dev *mdev = to_mock_dev(dev);

	switch (cap) {
	case IOMMU_CAP_CACHE_COHERENCY:
		return true;
	case IOMMU_CAP_DIRTY_TRACKING:
		return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
	default:
		break;
	}

	return false;
}

/* IOPF queue used by the dev_enable_feat/dev_disable_feat callbacks */
static struct iopf_queue *mock_iommu_iopf_queue;

/* The one mock IOMMU instance returned for every probed mock device */
static struct mock_iommu_device {
	struct iommu_device iommu_dev;
	/* Completed by mock_viommu_destroy() when users drops to zero */
	struct completion complete;
	/* Incremented for every vIOMMU allocated on this IOMMU */
	refcount_t users;
} mock_iommu;

/* Only devices on the iommufd mock bus are claimed by this driver */
static struct iommu_device *mock_probe_device(struct device *dev)
{
	if (dev->bus != &iommufd_mock_bus_type.bus)
		return ERR_PTR(-ENODEV);
	return &mock_iommu.iommu_dev;
}

/* Page responses are intentionally ignored by the mock driver */
static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt,
				      struct iommu_page_response *msg)
{
}

/*
 * Only IOMMU_DEV_FEAT_IOPF can be enabled, and only while the mock IOPF queue
 * exists; otherwise -ENODEV.
 */
static int mock_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue)
		return -ENODEV;

	return
iopf_queue_add_device(mock_iommu_iopf_queue, dev);
}

/*
 * Counterpart of mock_dev_enable_feat(): removes @dev from the mock IOPF
 * queue. Returns -ENODEV for any other feature or when there is no queue.
 */
static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue)
		return -ENODEV;

	iopf_queue_remove_device(mock_iommu_iopf_queue, dev);

	return 0;
}

/*
 * Drop the vIOMMU's reference on its mock IOMMU and signal the completion
 * once the last user is gone.
 */
static void mock_viommu_destroy(struct iommufd_viommu *viommu)
{
	struct mock_iommu_device *mock_iommu = container_of(
		viommu->iommu_dev, struct mock_iommu_device, iommu_dev);

	if (refcount_dec_and_test(&mock_iommu->users))
		complete(&mock_iommu->complete);

	/* iommufd core frees mock_viommu and viommu */
}

/*
 * Allocate a nested domain through a vIOMMU. The new domain remembers the
 * vIOMMU and uses the vIOMMU's s2_parent as its parent. No @flags are
 * supported (-EOPNOTSUPP).
 */
static struct iommu_domain *
mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
				const struct iommu_user_data *user_data)
{
	struct mock_viommu *mock_viommu = to_mock_viommu(viommu);
	struct mock_iommu_domain_nested *mock_nested;

	if (flags)
		return ERR_PTR(-EOPNOTSUPP);

	mock_nested = __mock_domain_alloc_nested(user_data);
	if (IS_ERR(mock_nested))
		return ERR_CAST(mock_nested);
	mock_nested->mock_viommu = mock_viommu;
	mock_nested->parent = mock_viommu->s2_parent;
	return &mock_nested->domain;
}

/*
 * Process an array of selftest invalidation commands against the fake caches
 * of the mock devices behind @viommu. Commands are handled in order and
 * processing stops at the first invalid one; array->entry_num is updated to
 * the number of commands that were fully handled.
 */
static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu,
					struct iommu_user_data_array *array)
{
	struct iommu_viommu_invalidate_selftest *cmds;
	struct iommu_viommu_invalidate_selftest *cur;
	struct iommu_viommu_invalidate_selftest *end;
	int rc;

	/* A zero-length array is allowed to validate the array type */
	if (array->entry_num == 0 &&
	    array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) {
		array->entry_num = 0;
		return 0;
	}

	cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL);
	if (!cmds)
		return -ENOMEM;
	cur = cmds;
	end = cmds + array->entry_num;

	/* The whole user array is copied in one go, so the layout must match */
	static_assert(sizeof(*cmds) == 3 * sizeof(u32));
	rc = iommu_copy_struct_from_full_user_array(
		cmds, sizeof(*cmds), array,
		IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST);
	if (rc)
		goto out;

	while (cur != end) {
		struct mock_dev *mdev;
		struct device *dev;
		int i;

		if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) {
			rc = -EOPNOTSUPP;
			goto
out;
		}

		if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) {
			rc = -EINVAL;
			goto out;
		}

		/* The vdev lookup and the cache update run under the vdevs lock */
		xa_lock(&viommu->vdevs);
		dev = iommufd_viommu_find_dev(viommu,
					      (unsigned long)cur->vdev_id);
		if (!dev) {
			xa_unlock(&viommu->vdevs);
			rc = -EINVAL;
			goto out;
		}

		mdev = container_of(dev, struct mock_dev, dev);

		if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) {
			/* Invalidate all cache entries and ignore cache_id */
			for (i = 0; i < MOCK_DEV_CACHE_NUM; i++)
				mdev->cache[i] = 0;
		} else {
			mdev->cache[cur->cache_id] = 0;
		}
		xa_unlock(&viommu->vdevs);

		cur++;
	}
out:
	/* Report how many commands were handled before stopping */
	array->entry_num = cur - cmds;
	kfree(cmds);
	return rc;
}

static struct iommufd_viommu_ops mock_viommu_ops = {
	.destroy = mock_viommu_destroy,
	.alloc_domain_nested = mock_viommu_alloc_domain_nested,
	.cache_invalidate = mock_viommu_cache_invalidate,
};

/*
 * Allocate a mock vIOMMU for @dev. Only IOMMU_VIOMMU_TYPE_SELFTEST is
 * supported (-EOPNOTSUPP otherwise). On success the mock IOMMU's user count is
 * incremented; mock_viommu_destroy() drops it again.
 */
static struct iommufd_viommu *mock_viommu_alloc(struct device *dev,
						struct iommu_domain *domain,
						struct iommufd_ctx *ictx,
						unsigned int viommu_type)
{
	struct mock_iommu_device *mock_iommu =
		iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev);
	struct mock_viommu *mock_viommu;

	if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST)
		return ERR_PTR(-EOPNOTSUPP);

	mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core,
					   &mock_viommu_ops);
	if (IS_ERR(mock_viommu))
		return ERR_CAST(mock_viommu);

	refcount_inc(&mock_iommu->users);
	return &mock_viommu->core;
}

/* Driver ops of the mock IOMMU */
static const struct iommu_ops mock_ops = {
	/*
	 * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type()
	 * because it is zero.
*/
	.default_domain = &mock_blocking_domain,
	.blocked_domain = &mock_blocking_domain,
	.owner = THIS_MODULE,
	.pgsize_bitmap = MOCK_IO_PAGE_SIZE,
	.hw_info = mock_domain_hw_info,
	.domain_alloc_paging_flags = mock_domain_alloc_paging_flags,
	.domain_alloc_nested = mock_domain_alloc_nested,
	.capable = mock_domain_capable,
	.device_group = generic_device_group,
	.probe_device = mock_probe_device,
	.page_response = mock_domain_page_response,
	.dev_enable_feat = mock_dev_enable_feat,
	.dev_disable_feat = mock_dev_disable_feat,
	.user_pasid_table = true,
	.viommu_alloc = mock_viommu_alloc,
	/*
	 * Ops of mock paging domains. This pointer is also compared against
	 * domain->ops to recognise paging domains owned by this driver.
	 */
	.default_domain_ops =
		&(struct iommu_domain_ops){
			.free = mock_domain_free,
			.attach_dev = mock_domain_nop_attach,
			.map_pages = mock_domain_map_pages,
			.unmap_pages = mock_domain_unmap_pages,
			.iova_to_phys = mock_domain_iova_to_phys,
		},
};

/* Free a mock nested domain */
static void mock_domain_free_nested(struct iommu_domain *domain)
{
	kfree(to_mock_nested(domain));
}

/*
 * Process an array of selftest invalidation requests against the fake IOTLB
 * of a nested domain. Entries are copied from userspace and handled one at a
 * time; processing stops at the first failure and array->entry_num is updated
 * to the number of entries that were fully handled.
 *
 * Returns 0 on success, -EINVAL for a wrong array type or an out of range
 * iotlb_id, -EOPNOTSUPP for unknown flags, or the copy error.
 */
static int
mock_domain_cache_invalidate_user(struct iommu_domain *domain,
				  struct iommu_user_data_array *array)
{
	struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain);
	struct iommu_hwpt_invalidate_selftest inv;
	u32 processed = 0;
	int i = 0, j;
	int rc = 0;

	if (array->type != IOMMU_HWPT_INVALIDATE_DATA_SELFTEST) {
		rc = -EINVAL;
		goto out;
	}

	for ( ; i < array->entry_num; i++) {
		rc = iommu_copy_struct_from_user_array(&inv, array,
						       IOMMU_HWPT_INVALIDATE_DATA_SELFTEST,
						       i, iotlb_id);
		if (rc)
			break;

		if (inv.flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) {
			rc = -EOPNOTSUPP;
			break;
		}

		if (inv.iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX) {
			rc = -EINVAL;
			break;
		}

		if (inv.flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) {
			/* Invalidate all mock iotlb entries and ignore iotlb_id */
			for (j = 0; j < MOCK_NESTED_DOMAIN_IOTLB_NUM; j++)
				mock_nested->iotlb[j] = 0;
		} else {
			mock_nested->iotlb[inv.iotlb_id] = 0;
		}

		processed++;
	}

out:
	array->entry_num = processed;
	return rc;
}

/* Ops of mock nested domains; also used to recognise them by ops pointer */
static struct iommu_domain_ops domain_nested_ops = {
	.free = mock_domain_free_nested,
	.attach_dev =
mock_domain_nop_attach, .cache_invalidate_user = mock_domain_cache_invalidate_user, }; static inline struct iommufd_hw_pagetable * __get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type) { struct iommufd_object *obj; obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type); if (IS_ERR(obj)) return ERR_CAST(obj); return container_of(obj, struct iommufd_hw_pagetable, obj); } static inline struct iommufd_hw_pagetable * get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, struct mock_iommu_domain **mock) { struct iommufd_hw_pagetable *hwpt; hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING); if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || hwpt->domain->ops != mock_ops.default_domain_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock = to_mock_domain(hwpt->domain); return hwpt; } static inline struct iommufd_hw_pagetable * get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, struct mock_iommu_domain_nested **mock_nested) { struct iommufd_hw_pagetable *hwpt; hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED); if (IS_ERR(hwpt)) return hwpt; if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || hwpt->domain->ops != &domain_nested_ops) { iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } *mock_nested = to_mock_nested(hwpt->domain); return hwpt; } static void mock_dev_release(struct device *dev) { struct mock_dev *mdev = to_mock_dev(dev); ida_free(&mock_dev_ida, mdev->id); kfree(mdev); } static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; int rc, i; if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(-ENOMEM); device_initialize(&mdev->dev); mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; for (i = 0; i < 
MOCK_DEV_CACHE_NUM; i++)
		mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT;

	rc = ida_alloc(&mock_dev_ida, GFP_KERNEL);
	if (rc < 0)
		goto err_put;
	mdev->id = rc;

	rc = dev_set_name(&mdev->dev, "iommufd_mock%u", mdev->id);
	if (rc)
		goto err_put;

	rc = device_add(&mdev->dev);
	if (rc)
		goto err_put;
	return mdev;

err_put:
	/* Dropping the reference ends up in mock_dev_release() */
	put_device(&mdev->dev);
	return ERR_PTR(rc);
}

/* Unregister a mock device created by mock_dev_create() */
static void mock_dev_destroy(struct mock_dev *mdev)
{
	device_unregister(&mdev->dev);
}

/* Mock devices are recognised by their release callback */
bool iommufd_selftest_is_mock_dev(struct device *dev)
{
	return dev->release == mock_dev_release;
}

/*
 * Create an hw_pagetable with the mock domain so we can test the domain ops.
 *
 * Creates a selftest object holding a new mock device, binds the device to
 * the iommufd context and attaches it to the page table given in cmd->id. The
 * resulting hwpt, selftest object and device IDs are returned to userspace in
 * cmd->mock_domain. Every step is undone in reverse order on failure.
 */
static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
				    struct iommu_test_cmd *cmd)
{
	struct iommufd_device *idev;
	struct selftest_obj *sobj;
	u32 pt_id = cmd->id;
	u32 dev_flags = 0;
	u32 idev_id;
	int rc;

	sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST);
	if (IS_ERR(sobj))
		return PTR_ERR(sobj);

	sobj->idev.ictx = ucmd->ictx;
	sobj->type = TYPE_IDEV;

	/* Device flags are only taken from the _FLAGS variant of the op */
	if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS)
		dev_flags = cmd->mock_domain_flags.dev_flags;

	sobj->idev.mock_dev = mock_dev_create(dev_flags);
	if (IS_ERR(sobj->idev.mock_dev)) {
		rc = PTR_ERR(sobj->idev.mock_dev);
		goto out_sobj;
	}

	idev = iommufd_device_bind(ucmd->ictx, &sobj->idev.mock_dev->dev,
				   &idev_id);
	if (IS_ERR(idev)) {
		rc = PTR_ERR(idev);
		goto out_mdev;
	}
	sobj->idev.idev = idev;

	rc = iommufd_device_attach(idev, &pt_id);
	if (rc)
		goto out_unbind;

	/* Userspace must destroy the device_id to destroy the object */
	cmd->mock_domain.out_hwpt_id = pt_id;
	cmd->mock_domain.out_stdev_id = sobj->obj.id;
	cmd->mock_domain.out_idev_id = idev_id;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc)
		goto out_detach;
	iommufd_object_finalize(ucmd->ictx, &sobj->obj);
	return 0;

out_detach:
	iommufd_device_detach(idev);
out_unbind:
	iommufd_device_unbind(idev);
out_mdev:
	mock_dev_destroy(sobj->idev.mock_dev);
out_sobj:
	iommufd_object_abort(ucmd->ictx, &sobj->obj);
	return rc;
}

/* Replace the mock domain
with a manually allocated hw_pagetable */
/*
 * @device_id names the selftest object that owns the mock device and @pt_id
 * the page table to switch to. The pt_id written back by
 * iommufd_device_replace() is returned in cmd->mock_domain_replace.pt_id.
 */
static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd,
					    unsigned int device_id, u32 pt_id,
					    struct iommu_test_cmd *cmd)
{
	struct iommufd_object *dev_obj;
	struct selftest_obj *sobj;
	int rc;

	/*
	 * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure
	 * it doesn't race with detach, which is not allowed.
	 */
	dev_obj =
		iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST);
	if (IS_ERR(dev_obj))
		return PTR_ERR(dev_obj);

	sobj = to_selftest_obj(dev_obj);
	if (sobj->type != TYPE_IDEV) {
		rc = -EINVAL;
		goto out_dev_obj;
	}

	rc = iommufd_device_replace(sobj->idev.idev, &pt_id);
	if (rc)
		goto out_dev_obj;

	cmd->mock_domain_replace.pt_id = pt_id;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));

out_dev_obj:
	iommufd_put_object(ucmd->ictx, dev_obj);
	return rc;
}

/*
 * Add an additional reserved IOVA to the IOAS.
 *
 * Reserves the inclusive range [start, start + length - 1] in the IOAS
 * @mockpt_id, with iova_rwsem held for write.
 */
static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd,
				     unsigned int mockpt_id,
				     unsigned long start, size_t length)
{
	struct iommufd_ioas *ioas;
	int rc;

	ioas = iommufd_get_ioas(ucmd->ictx, mockpt_id);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);
	down_write(&ioas->iopt.iova_rwsem);
	rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL);
	up_write(&ioas->iopt.iova_rwsem);
	iommufd_put_object(ucmd->ictx, &ioas->obj);
	return rc;
}

/*
 * Check that every pfn under each iova matches the pfn under a user VA.
 *
 * @iova, @length and @uptr must all be multiples of MOCK_IO_PAGE_SIZE and
 * @uptr + @length must not overflow, otherwise -EINVAL. The range is compared
 * one mock page at a time against the pfns xarray of the mock domain behind
 * @mockpt_id.
 */
static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd,
				    unsigned int mockpt_id, unsigned long iova,
				    size_t length, void __user *uptr)
{
	struct iommufd_hw_pagetable *hwpt;
	struct mock_iommu_domain *mock;
	uintptr_t end;
	int rc;

	/* end is only needed as the output of the overflow check */
	if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE ||
	    (uintptr_t)uptr % MOCK_IO_PAGE_SIZE ||
	    check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
		return -EINVAL;

	hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
	if (IS_ERR(hwpt))
		return PTR_ERR(hwpt);

	for (; length; length -= MOCK_IO_PAGE_SIZE) {
		struct page *pages[1];
		unsigned long pfn;
		long
npages;
		void *ent;

		/* Look up the CPU page backing the user address */
		npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0,
					     pages);
		if (npages < 0) {
			rc = npages;
			goto out_put;
		}
		if (WARN_ON(npages != 1)) {
			rc = -EFAULT;
			goto out_put;
		}
		/* Only the pfn is needed, so the page is released right away */
		pfn = page_to_pfn(pages[0]);
		put_page(pages[0]);

		/*
		 * The address stored in the mock domain must equal the CPU
		 * physical address, including the offset of uptr inside its
		 * CPU page.
		 */
		ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
		if (!ent ||
		    (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE !=
			    pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) {
			rc = -EINVAL;
			goto out_put;
		}
		iova += MOCK_IO_PAGE_SIZE;
		uptr += MOCK_IO_PAGE_SIZE;
	}
	rc = 0;

out_put:
	iommufd_put_object(ucmd->ictx, &hwpt->obj);
	return rc;
}

/*
 * Check that the page ref count matches, to look for missing pin/unpins.
 *
 * @uptr and @length must be PAGE_SIZE aligned and must not overflow when
 * added (-EINVAL). For every page in the range that is not a compound page,
 * page_ref_count() / GUP_PIN_COUNTING_BIAS must equal @refs, otherwise -EIO.
 */
static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd,
				      void __user *uptr, size_t length,
				      unsigned int refs)
{
	uintptr_t end;

	if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE ||
	    check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end))
		return -EINVAL;

	for (; length; length -= PAGE_SIZE) {
		struct page *pages[1];
		long npages;

		npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages);
		if (npages < 0)
			return npages;
		if (WARN_ON(npages != 1))
			return -EFAULT;
		/* Compound pages are skipped by this check */
		if (!PageCompound(pages[0])) {
			unsigned int count;

			count = page_ref_count(pages[0]);
			if (count / GUP_PIN_COUNTING_BIAS != refs) {
				put_page(pages[0]);
				return -EIO;
			}
		}
		put_page(pages[0]);
		uptr += PAGE_SIZE;
	}
	return 0;
}

/*
 * Check that fake IOTLB entry @iotlb_id of the mock nested domain behind
 * @mockpt_id holds the value @iotlb. Returns -EINVAL on a mismatch or an out
 * of range @iotlb_id.
 */
static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
				       u32 mockpt_id, unsigned int iotlb_id,
				       u32 iotlb)
{
	struct mock_iommu_domain_nested *mock_nested;
	struct iommufd_hw_pagetable *hwpt;
	int rc = 0;

	hwpt = get_md_pagetable_nested(ucmd, mockpt_id, &mock_nested);
	if (IS_ERR(hwpt))
		return PTR_ERR(hwpt);

	/* Same value get_md_pagetable_nested() already stored above */
	mock_nested = to_mock_nested(hwpt->domain);

	/* The range check comes first so the array is never over-indexed */
	if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX ||
	    mock_nested->iotlb[iotlb_id] != iotlb)
		rc = -EINVAL;
	iommufd_put_object(ucmd->ictx, &hwpt->obj);
	return rc;
}

/*
 * Check that fake cache entry @cache_id of the mock device behind @idev_id
 * holds the value @cache. Returns -EINVAL on a mismatch or an out of range
 * @cache_id.
 */
static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id,
					unsigned int cache_id, u32 cache)
{
	struct
iommufd_device *idev;
	struct mock_dev *mdev;
	int rc = 0;

	idev = iommufd_get_device(ucmd, idev_id);
	if (IS_ERR(idev))
		return PTR_ERR(idev);
	mdev = container_of(idev->dev, struct mock_dev, dev);

	/* The range check comes first so the array is never over-indexed */
	if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache)
		rc = -EINVAL;
	iommufd_put_object(ucmd->ictx, &idev->obj);
	return rc;
}

/* Per-fd state of a selftest access, stored in file->private_data */
struct selftest_access {
	struct iommufd_access *access;
	struct file *file;
	/* Protects items and next_id */
	struct mutex lock;
	/* List of struct selftest_access_item, one per pinned range */
	struct list_head items;
	/* ID handed to the next item that gets created */
	unsigned int next_id;
	bool destroying;
};

/* One range pinned through a selftest access */
struct selftest_access_item {
	struct list_head items_elm;
	unsigned long iova;
	size_t length;
	unsigned int id;
};

static const struct file_operations iommfd_test_staccess_fops;

/*
 * Translate @fd into its selftest_access. Returns ERR_PTR(-EBADFD) if @fd is
 * not open or is not a selftest access file. On success the file reference
 * taken by fget() is kept; callers release it with fput(staccess->file).
 */
static struct selftest_access *iommufd_access_get(int fd)
{
	struct file *file;

	file = fget(fd);
	if (!file)
		return ERR_PTR(-EBADFD);

	if (file->f_op != &iommfd_test_staccess_fops) {
		fput(file);
		return ERR_PTR(-EBADFD);
	}
	return file->private_data;
}

/*
 * Unmap callback of the selftest access: unpin and free every item that
 * overlaps [iova, iova + length - 1]. @data is the selftest_access.
 */
static void iommufd_test_access_unmap(void *data, unsigned long iova,
				      unsigned long length)
{
	unsigned long iova_last = iova + length - 1;
	struct selftest_access *staccess = data;
	struct selftest_access_item *item;
	struct selftest_access_item *tmp;

	mutex_lock(&staccess->lock);
	/* _safe variant because matching items are deleted while iterating */
	list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) {
		/* Skip items that lie entirely outside the unmapped range */
		if (iova > item->iova + item->length - 1 ||
		    iova_last < item->iova)
			continue;
		list_del(&item->items_elm);
		iommufd_access_unpin_pages(staccess->access, item->iova,
					   item->length);
		kfree(item);
	}
	mutex_unlock(&staccess->lock);
}

/*
 * Unpin and free the item @item_id of the selftest access behind the fd
 * @access_id. Returns -ENOENT when no item has that ID.
 */
static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd,
					    unsigned int access_id,
					    unsigned int item_id)
{
	struct selftest_access_item *item;
	struct selftest_access *staccess;

	staccess = iommufd_access_get(access_id);
	if (IS_ERR(staccess))
		return PTR_ERR(staccess);

	mutex_lock(&staccess->lock);
	list_for_each_entry(item, &staccess->items, items_elm) {
		if (item->id == item_id) {
			list_del(&item->items_elm);
			iommufd_access_unpin_pages(staccess->access, item->iova,
						   item->length);
mutex_unlock(&staccess->lock); kfree(item); fput(staccess->file); return 0; } } mutex_unlock(&staccess->lock); fput(staccess->file); return -ENOENT; } static int iommufd_test_staccess_release(struct inode *inode, struct file *filep) { struct selftest_access *staccess = filep->private_data; if (staccess->access) { iommufd_test_access_unmap(staccess, 0, ULONG_MAX); iommufd_access_destroy(staccess->access); } mutex_destroy(&staccess->lock); kfree(staccess); return 0; } static const struct iommufd_access_ops selftest_access_ops_pin = { .needs_pin_pages = 1, .unmap = iommufd_test_access_unmap, }; static const struct iommufd_access_ops selftest_access_ops = { .unmap = iommufd_test_access_unmap, }; static const struct file_operations iommfd_test_staccess_fops = { .release = iommufd_test_staccess_release, }; static struct selftest_access *iommufd_test_alloc_access(void) { struct selftest_access *staccess; struct file *filep; staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT); if (!staccess) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&staccess->items); mutex_init(&staccess->lock); filep = anon_inode_getfile("[iommufd_test_staccess]", &iommfd_test_staccess_fops, staccess, O_RDWR); if (IS_ERR(filep)) { kfree(staccess); return ERR_CAST(filep); } staccess->file = filep; return staccess; } static int iommufd_test_create_access(struct iommufd_ucmd *ucmd, unsigned int ioas_id, unsigned int flags) { struct iommu_test_cmd *cmd = ucmd->cmd; struct selftest_access *staccess; struct iommufd_access *access; u32 id; int fdno; int rc; if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) return -EOPNOTSUPP; staccess = iommufd_test_alloc_access(); if (IS_ERR(staccess)) return PTR_ERR(staccess); fdno = get_unused_fd_flags(O_CLOEXEC); if (fdno < 0) { rc = -ENOMEM; goto out_free_staccess; } access = iommufd_access_create( ucmd->ictx, (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ? 
&selftest_access_ops_pin :
			&selftest_access_ops,
		staccess, &id);
	if (IS_ERR(access)) {
		rc = PTR_ERR(access);
		goto out_put_fdno;
	}
	rc = iommufd_access_attach(access, ioas_id);
	if (rc)
		goto out_destroy;
	cmd->create_access.out_access_fd = fdno;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc)
		goto out_destroy;

	/* From here on the release callback owns the access */
	staccess->access = access;
	fd_install(fdno, staccess->file);
	return 0;

out_destroy:
	iommufd_access_destroy(access);
out_put_fdno:
	put_unused_fd(fdno);
out_free_staccess:
	/* Dropping the file runs the release callback, which frees staccess */
	fput(staccess->file);
	return rc;
}

/*
 * Switch the selftest access behind the fd @access_id over to the IOAS
 * @ioas_id.
 */
static int iommufd_test_access_replace_ioas(struct iommufd_ucmd *ucmd,
					    unsigned int access_id,
					    unsigned int ioas_id)
{
	struct selftest_access *staccess;
	int rc;

	staccess = iommufd_access_get(access_id);
	if (IS_ERR(staccess))
		return PTR_ERR(staccess);

	rc = iommufd_access_replace(staccess->access, ioas_id);
	fput(staccess->file);
	return rc;
}

/*
 * Check that the pages in a page array match the pages in the user VA.
 *
 * Walks @npages pages starting at @uptr and returns -EBADE at the first page
 * that differs from the corresponding entry in @pages.
 */
static int iommufd_test_check_pages(void __user *uptr, struct page **pages,
				    size_t npages)
{
	for (; npages; npages--) {
		struct page *tmp_pages[1];
		long rc;

		rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages);
		if (rc < 0)
			return rc;
		if (WARN_ON(rc != 1))
			return -EFAULT;
		/* Only the pointer value is compared after the put */
		put_page(tmp_pages[0]);
		if (tmp_pages[0] != *pages)
			return -EBADE;
		pages++;
		uptr += PAGE_SIZE;
	}
	return 0;
}

/*
 * Pin [iova, iova + length) through the selftest access behind the fd
 * @access_id and remember the range as a new item whose ID is returned in
 * cmd->access_pages.out_access_pages_id. When @uptr is not NULL the pinned
 * pages are also checked against the user VA. Only accesses created with the
 * pinning ops are accepted (-EOPNOTSUPP otherwise).
 */
static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
				     unsigned int access_id, unsigned long iova,
				     size_t length, void __user *uptr,
				     u32 flags)
{
	struct iommu_test_cmd *cmd = ucmd->cmd;
	struct selftest_access_item *item;
	struct selftest_access *staccess;
	struct page **pages;
	size_t npages;
	int rc;

	/* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
	if (length > 16*1024*1024)
		return -ENOMEM;

	if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ))
		return -EOPNOTSUPP;

	staccess = iommufd_access_get(access_id);
	if (IS_ERR(staccess))
		return PTR_ERR(staccess);

	if (staccess->access->ops != &selftest_access_ops_pin) {
		rc = -EOPNOTSUPP;
		goto out_put;
}

	/* With the SYZ flag the iova field holds an {nth_area, offset} pair */
	if (flags & MOCK_FLAGS_ACCESS_SYZ)
		iova = iommufd_test_syz_conv_iova(staccess->access,
					&cmd->access_pages.iova);

	/* Number of CPU pages touched by [iova, iova + length) */
	npages = (ALIGN(iova + length, PAGE_SIZE) -
		  ALIGN_DOWN(iova, PAGE_SIZE)) /
		 PAGE_SIZE;
	pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT);
	if (!pages) {
		rc = -ENOMEM;
		goto out_put;
	}

	/*
	 * Drivers will need to think very carefully about this locking. The
	 * core code can do multiple unmaps instantaneously after
	 * iommufd_access_pin_pages() and *all* the unmaps must not return until
	 * the range is unpinned. This simple implementation puts a global lock
	 * around the pin, which may not suit drivers that want this to be a
	 * performance path. drivers that get this wrong will trigger WARN_ON
	 * races and cause EDEADLOCK failures to userspace.
	 */
	mutex_lock(&staccess->lock);
	rc = iommufd_access_pin_pages(staccess->access, iova, length, pages,
				      flags & MOCK_FLAGS_ACCESS_WRITE);
	if (rc)
		goto out_unlock;

	/* For syzkaller allow uptr to be NULL to skip this check */
	if (uptr) {
		/* Rewind uptr to the start of the CPU page holding iova */
		rc = iommufd_test_check_pages(
			uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages,
			npages);
		if (rc)
			goto out_unaccess;
	}

	item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT);
	if (!item) {
		rc = -ENOMEM;
		goto out_unaccess;
	}

	item->iova = iova;
	item->length = length;
	item->id = staccess->next_id++;
	list_add_tail(&item->items_elm, &staccess->items);

	cmd->access_pages.out_access_pages_id = item->id;
	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc)
		goto out_free_item;
	/* Success: keep the pin and the item, only drop the temporaries */
	goto out_unlock;

out_free_item:
	list_del(&item->items_elm);
	kfree(item);
out_unaccess:
	iommufd_access_unpin_pages(staccess->access, iova, length);
out_unlock:
	mutex_unlock(&staccess->lock);
	kvfree(pages);
out_put:
	fput(staccess->file);
	return rc;
}

/*
 * Read or write @length bytes at @iova through the selftest access behind the
 * fd @access_id, using a kernel bounce buffer. With MOCK_ACCESS_RW_WRITE the
 * data comes from @ubuf, otherwise the data read is copied out to @ubuf.
 */
static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
				  unsigned int access_id, unsigned long iova,
				  size_t length, void __user *ubuf,
				  unsigned int flags)
{
	struct iommu_test_cmd *cmd = ucmd->cmd;
	struct selftest_access *staccess;
	void *tmp;
	int rc;

	/* Prevent syzkaller from
triggering a WARN_ON in kvzalloc() */
	if (length > 16*1024*1024)
		return -ENOMEM;

	if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH |
		      MOCK_FLAGS_ACCESS_SYZ))
		return -EOPNOTSUPP;

	staccess = iommufd_access_get(access_id);
	if (IS_ERR(staccess))
		return PTR_ERR(staccess);

	tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT);
	if (!tmp) {
		rc = -ENOMEM;
		goto out_put;
	}

	/* For a write, fetch the data to store from userspace first */
	if (flags & MOCK_ACCESS_RW_WRITE) {
		if (copy_from_user(tmp, ubuf, length)) {
			rc = -EFAULT;
			goto out_free;
		}
	}

	/* With the SYZ flag the iova field holds an {nth_area, offset} pair */
	if (flags & MOCK_FLAGS_ACCESS_SYZ)
		iova = iommufd_test_syz_conv_iova(staccess->access,
						  &cmd->access_rw.iova);

	rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags);
	if (rc)
		goto out_free;
	/* For a read, hand the data back to userspace */
	if (!(flags & MOCK_ACCESS_RW_WRITE)) {
		if (copy_to_user(ubuf, tmp, length)) {
			rc = -EFAULT;
			goto out_free;
		}
	}

out_free:
	kvfree(tmp);
out_put:
	fput(staccess->file);
	return rc;
}
/* The mock flags are passed to iommufd_access_rw() as is, so they must match */
static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
	      __IOMMUFD_ACCESS_RW_SLOW_PATH);

/*
 * Mark pages of a mock domain dirty. @uptr is a user bitmap with one bit per
 * @page_size unit of [iova, iova + length); for every set bit whose page is
 * present in the pfns xarray, MOCK_PFN_DIRTY_IOVA is set on the entry. The
 * number of entries marked is returned in cmd->dirty.out_nr_dirty.
 *
 * Returns -EINVAL for bad arguments or when dirty tracking is not enabled on
 * the domain.
 */
static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
			      unsigned long iova, size_t length,
			      unsigned long page_size, void __user *uptr,
			      u32 flags)
{
	unsigned long i, max;
	struct iommu_test_cmd *cmd = ucmd->cmd;
	struct iommufd_hw_pagetable *hwpt;
	struct mock_iommu_domain *mock;
	int rc, count = 0;
	void *tmp;

	if (!page_size || !length || iova % page_size || length % page_size ||
	    !uptr)
		return -EINVAL;

	hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
	if (IS_ERR(hwpt))
		return PTR_ERR(hwpt);

	if (!(mock->flags & MOCK_DIRTY_TRACK)) {
		rc = -EINVAL;
		goto out_put;
	}

	/* Number of bits in the user bitmap */
	max = length / page_size;
	/* Allocate whole longs for test_bit(); only whole bytes are copied in */
	tmp = kvzalloc(DIV_ROUND_UP(max, BITS_PER_LONG) * sizeof(unsigned long),
		       GFP_KERNEL_ACCOUNT);
	if (!tmp) {
		rc = -ENOMEM;
		goto out_put;
	}

	if (copy_from_user(tmp, uptr,DIV_ROUND_UP(max, BITS_PER_BYTE))) {
		rc = -EFAULT;
		goto out_free;
	}

	for (i = 0; i < max; i++) {
		unsigned long cur = iova + i * page_size;
		void *ent, *old;

		if (!test_bit(i, (unsigned long *)tmp))
continue; ent = xa_load(&mock->pfns, cur / page_size); if (ent) { unsigned long val; val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; old = xa_store(&mock->pfns, cur / page_size, xa_mk_value(val), GFP_KERNEL); WARN_ON_ONCE(ent != old); count++; } } cmd->dirty.out_nr_dirty = count; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kvfree(tmp); out_put: iommufd_put_object(ucmd->ictx, &hwpt->obj); return rc; } static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, struct iommu_test_cmd *cmd) { struct iopf_fault event = { }; struct iommufd_device *idev; idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); event.fault.prm.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; if (cmd->trigger_iopf.pasid != IOMMU_NO_PASID) event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; event.fault.type = IOMMU_FAULT_PAGE_REQ; event.fault.prm.addr = cmd->trigger_iopf.addr; event.fault.prm.pasid = cmd->trigger_iopf.pasid; event.fault.prm.grpid = cmd->trigger_iopf.grpid; event.fault.prm.perm = cmd->trigger_iopf.perm; iommu_report_device_fault(idev->dev, &event); iommufd_put_object(ucmd->ictx, &idev->obj); return 0; } void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = to_selftest_obj(obj); switch (sobj->type) { case TYPE_IDEV: iommufd_device_detach(sobj->idev.idev); iommufd_device_unbind(sobj->idev.idev); mock_dev_destroy(sobj->idev.mock_dev); break; } } int iommufd_test(struct iommufd_ucmd *ucmd) { struct iommu_test_cmd *cmd = ucmd->cmd; switch (cmd->op) { case IOMMU_TEST_OP_ADD_RESERVED: return iommufd_test_add_reserved(ucmd, cmd->id, cmd->add_reserved.start, cmd->add_reserved.length); case IOMMU_TEST_OP_MOCK_DOMAIN: case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS: return iommufd_test_mock_domain(ucmd, cmd); case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: return iommufd_test_mock_domain_replace( ucmd, cmd->id, cmd->mock_domain_replace.pt_id, cmd); case IOMMU_TEST_OP_MD_CHECK_MAP: return 
iommufd_test_md_check_pa( ucmd, cmd->id, cmd->check_map.iova, cmd->check_map.length, u64_to_user_ptr(cmd->check_map.uptr)); case IOMMU_TEST_OP_MD_CHECK_REFS: return iommufd_test_md_check_refs( ucmd, u64_to_user_ptr(cmd->check_refs.uptr), cmd->check_refs.length, cmd->check_refs.refs); case IOMMU_TEST_OP_MD_CHECK_IOTLB: return iommufd_test_md_check_iotlb(ucmd, cmd->id, cmd->check_iotlb.id, cmd->check_iotlb.iotlb); case IOMMU_TEST_OP_DEV_CHECK_CACHE: return iommufd_test_dev_check_cache(ucmd, cmd->id, cmd->check_dev_cache.id, cmd->check_dev_cache.cache); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); case IOMMU_TEST_OP_ACCESS_REPLACE_IOAS: return iommufd_test_access_replace_ioas( ucmd, cmd->id, cmd->access_replace_ioas.ioas_id); case IOMMU_TEST_OP_ACCESS_PAGES: return iommufd_test_access_pages( ucmd, cmd->id, cmd->access_pages.iova, cmd->access_pages.length, u64_to_user_ptr(cmd->access_pages.uptr), cmd->access_pages.flags); case IOMMU_TEST_OP_ACCESS_RW: return iommufd_test_access_rw( ucmd, cmd->id, cmd->access_rw.iova, cmd->access_rw.length, u64_to_user_ptr(cmd->access_rw.uptr), cmd->access_rw.flags); case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES: return iommufd_test_access_item_destroy( ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id); case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT: /* Protect _batch_init(), can not be less than elmsz */ if (cmd->memory_limit.limit < sizeof(unsigned long) + sizeof(u32)) return -EINVAL; iommufd_test_memory_limit = cmd->memory_limit.limit; return 0; case IOMMU_TEST_OP_DIRTY: return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova, cmd->dirty.length, cmd->dirty.page_size, u64_to_user_ptr(cmd->dirty.uptr), cmd->dirty.flags); case IOMMU_TEST_OP_TRIGGER_IOPF: return iommufd_test_trigger_iopf(ucmd, cmd); default: return -EOPNOTSUPP; } } bool iommufd_should_fail(void) { return should_fail(&fail_iommufd, 1); } int __init iommufd_test_init(void) { struct platform_device_info pdevinfo 
= { .name = "iommufd_selftest_iommu", }; int rc; dbgfs_root = fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd); selftest_iommu_dev = platform_device_register_full(&pdevinfo); if (IS_ERR(selftest_iommu_dev)) { rc = PTR_ERR(selftest_iommu_dev); goto err_dbgfs; } rc = bus_register(&iommufd_mock_bus_type.bus); if (rc) goto err_platform; rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev, &selftest_iommu_dev->dev, NULL, "%s", dev_name(&selftest_iommu_dev->dev)); if (rc) goto err_bus; rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; refcount_set(&mock_iommu.users, 1); init_completion(&mock_iommu.complete); mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); return 0; err_sysfs: iommu_device_sysfs_remove(&mock_iommu.iommu_dev); err_bus: bus_unregister(&iommufd_mock_bus_type.bus); err_platform: platform_device_unregister(selftest_iommu_dev); err_dbgfs: debugfs_remove_recursive(dbgfs_root); return rc; } static void iommufd_test_wait_for_users(void) { if (refcount_dec_and_test(&mock_iommu.users)) return; /* * Time out waiting for iommu device user count to become 0. * * Note that this is just making an example here, since the selftest is * built into the iommufd module, i.e. it only unplugs the iommu device * when unloading the module. So, it is expected that this WARN_ON will * not trigger, as long as any iommufd FDs are open. */ WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete, msecs_to_jiffies(10000))); } void iommufd_test_exit(void) { if (mock_iommu_iopf_queue) { iopf_queue_free(mock_iommu_iopf_queue); mock_iommu_iopf_queue = NULL; } iommufd_test_wait_for_users(); iommu_device_sysfs_remove(&mock_iommu.iommu_dev); iommu_device_unregister_bus(&mock_iommu.iommu_dev, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); bus_unregister(&iommufd_mock_bus_type.bus); platform_device_unregister(selftest_iommu_dev); debugfs_remove_recursive(dbgfs_root); }
7 9 9 13 13 9 8 2 2 8 8 13 13 22 3 3 3 3 3 41 41 41 20 20 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 // SPDX-License-Identifier: GPL-2.0-only /* * count the number of connections matching an arbitrary key. * * (C) 2017 Red Hat GmbH * Author: Florian Westphal <fw@strlen.de> * * split from xt_connlimit.c: * (c) 2000 Gerd Knorr <kraxel@bytesex.org> * Nov 2002: Martin Bene <martin.bene@icomedias.com>: * only ignore TIME_WAIT or gone connections * (C) CC Computer Consultants GmbH, 2007 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/module.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_count.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> #define CONNCOUNT_SLOTS 256U #define CONNCOUNT_GC_MAX_NODES 8 #define MAX_KEYLEN 5 /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int cpu; u32 jiffies32; }; struct nf_conncount_rb { struct rb_node node; struct nf_conncount_list list; u32 key[MAX_KEYLEN]; struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; struct 
nf_conncount_data { unsigned int keylen; struct rb_root root[CONNCOUNT_SLOTS]; struct net *net; struct work_struct gc_work; unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; unsigned int gc_tree; }; static u_int32_t conncount_rnd __read_mostly; static struct kmem_cache *conncount_rb_cachep __read_mostly; static struct kmem_cache *conncount_conn_cachep __read_mostly; static inline bool already_closed(const struct nf_conn *conn) { if (nf_ct_protonum(conn) == IPPROTO_TCP) return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; else return false; } static int key_diff(const u32 *a, const u32 *b, unsigned int klen) { return memcmp(a, b, klen * sizeof(u32)); } static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { lockdep_assert_held(&list->list_lock); list->count--; list_del(&conn->node); kmem_cache_free(conncount_conn_cachep, conn); } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) return found; b = conn->jiffies32; a = (u32)jiffies; /* conn might have been added just before by another cpu and * might still be unconfirmed. In this case, nf_conntrack_find() * returns no result. Thus only evict if this cpu added the * stale entry or if the entry is older than two jiffies. 
*/ age = a - b; if (conn->cpu == cpu || age >= 2) { conn_free(list, conn); return ERR_PTR(-ENOENT); } return ERR_PTR(-EAGAIN); } static int __nf_conncount_add(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; if (time_is_after_eq_jiffies((unsigned long)list->last_gc)) goto add_new_node; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) return 0; /* already exists */ } else { collect++; } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". * * Attempt to avoid a re-add in this case. 
*/ nf_ct_put(found_ct); return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); conn_free(list, conn); collect++; continue; } nf_ct_put(found_ct); } add_new_node: if (WARN_ON_ONCE(list->count > INT_MAX)) return -EOVERFLOW; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) return -ENOMEM; conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; list->last_gc = (u32)jiffies; return 0; } int nf_conncount_add(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); ret = __nf_conncount_add(net, list, tuple, zone); spin_unlock_bh(&list->list_lock); return ret; } EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; list->last_gc = (u32)jiffies; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. 
*/ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; bool ret = false; /* don't bother if we just did GC */ if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc))) return false; /* don't bother if other cpu is already doing GC */ if (!spin_trylock(&list->list_lock)) return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) collected++; continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); conn_free(list, conn); collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) break; } if (!list->count) ret = true; list->last_gc = (u32)jiffies; spin_unlock(&list->list_lock); return ret; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); static void __tree_nodes_free(struct rcu_head *h) { struct nf_conncount_rb *rbconn; rbconn = container_of(h, struct nf_conncount_rb, rcu_head); kmem_cache_free(conncount_rb_cachep, rbconn); } /* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) { struct nf_conncount_rb *rbconn; while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); if (!rbconn->list.count) { rb_erase(&rbconn->node, root); call_rcu(&rbconn->rcu_head, __tree_nodes_free); } spin_unlock(&rbconn->list.list_lock); } } static void schedule_gc_worker(struct nf_conncount_data *data, int tree) { set_bit(tree, data->pending_trees); schedule_work(&data->gc_work); } static unsigned int insert_tree(struct net *net, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, const u32 *key, const 
struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); restart: parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { int diff; rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); parent = *rbnode; diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { int ret; ret = nf_conncount_add(net, &rbconn->list, tuple, zone); if (ret) count = 0; /* hotdrop */ else count = rbconn->list.count; tree_nodes_free(root, gc_nodes, gc_count); goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { tree_nodes_free(root, gc_nodes, gc_count); schedule_gc_worker(data, hash); gc_count = 0; do_gc = false; goto restart; } /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) goto out_unlock; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) { kmem_cache_free(conncount_rb_cachep, rbconn); goto out_unlock; } conn->tuple = *tuple; conn->zone = *zone; memcpy(rbconn->key, key, sizeof(u32) * data->keylen); nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; rbconn->list.count = count; rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; } static unsigned int count_tree(struct net *net, struct nf_conncount_data *data, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct rb_root *root; struct rb_node *parent; 
struct nf_conncount_rb *rbconn; unsigned int hash; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; rbconn = rb_entry(parent, struct nf_conncount_rb, node); diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { int ret; if (!tuple) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } spin_lock_bh(&rbconn->list.list_lock); /* Node might be about to be free'd. * We need to defer to insert_tree() in this case. */ if (rbconn->list.count == 0) { spin_unlock_bh(&rbconn->list.list_lock); break; } /* same source network -> be counted! */ ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); spin_unlock_bh(&rbconn->list.list_lock); if (ret) return 0; /* hotdrop */ else return rbconn->list.count; } } if (!tuple) return 0; return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) { struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; struct rb_root *root; struct rb_node *node; unsigned int tree, next_tree, gc_count = 0; tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; local_bh_disable(); rcu_read_lock(); for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); if (nf_conncount_gc_list(data->net, &rbconn->list)) gc_count++; } rcu_read_unlock(); local_bh_enable(); cond_resched(); spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count < ARRAY_SIZE(gc_nodes)) goto next; /* do not bother */ gc_count = 0; node = rb_first(root); while (node != NULL) { rbconn = rb_entry(node, struct nf_conncount_rb, node); node = rb_next(node); if (rbconn->list.count > 0) continue; 
gc_nodes[gc_count++] = rbconn; if (gc_count >= ARRAY_SIZE(gc_nodes)) { tree_nodes_free(root, gc_nodes, gc_count); gc_count = 0; } } tree_nodes_free(root, gc_nodes, gc_count); next: clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); if (next_tree < CONNCOUNT_SLOTS) { data->gc_tree = next_tree; schedule_work(work); } spin_unlock_bh(&nf_conncount_locks[tree]); } /* Count and return number of conntrack entries in 'net' with particular 'key'. * If 'tuple' is not null, insert it into the accounting data structure. * Call with RCU read lock. */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { return count_tree(net, data, key, tuple, zone); } EXPORT_SYMBOL_GPL(nf_conncount_count); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { struct nf_conncount_data *data; int i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || keylen == 0) return ERR_PTR(-EINVAL); net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; data->keylen = keylen / sizeof(u32); data->net = net; INIT_WORK(&data->gc_work, tree_gc_worker); return data; } EXPORT_SYMBOL_GPL(nf_conncount_init); void nf_conncount_cache_free(struct nf_conncount_list *list) { struct nf_conncount_tuple *conn, *conn_n; list_for_each_entry_safe(conn, conn_n, &list->head, node) kmem_cache_free(conncount_conn_cachep, conn); } EXPORT_SYMBOL_GPL(nf_conncount_cache_free); static void destroy_tree(struct rb_root *r) { struct nf_conncount_rb *rbconn; struct rb_node *node; while ((node = rb_first(r)) != NULL) { rbconn = rb_entry(node, struct nf_conncount_rb, node); rb_erase(node, r); 
nf_conncount_cache_free(&rbconn->list); kmem_cache_free(conncount_rb_cachep, rbconn); } } void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); kfree(data); } EXPORT_SYMBOL_GPL(nf_conncount_destroy); static int __init nf_conncount_modinit(void) { int i; for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0); if (!conncount_conn_cachep) return -ENOMEM; conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; } return 0; } static void __exit nf_conncount_modexit(void) { kmem_cache_destroy(conncount_conn_cachep); kmem_cache_destroy(conncount_rb_cachep); } module_init(nf_conncount_modinit); module_exit(nf_conncount_modexit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("netfilter: count number of connections matching a key"); MODULE_LICENSE("GPL");
1426 919 2 342 217 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. 
Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. 
* * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. 
*/ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define 
MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. 
*/ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. 
*/ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree 
*mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. 
If * state->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit.
* */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. 
*/ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. 
* AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. 
*/ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ 
atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. 
*/ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. 
*/ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */
3 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 // SPDX-License-Identifier: GPL-2.0 /* Some of this code is credited to Linux USB open source files that are distributed with Linux. Copyright: 2007 Metrologic Instruments. All rights reserved. Copyright: 2011 Azimut Ltd. 
<http://azimutrzn.ru/> */ #include <linux/kernel.h> #include <linux/tty.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb/serial.h> #define DRIVER_DESC "Metrologic Instruments Inc. - USB-POS driver" /* Product information. */ #define FOCUS_VENDOR_ID 0x0C2E #define FOCUS_PRODUCT_ID_BI 0x0720 #define FOCUS_PRODUCT_ID_UNI 0x0700 #define METROUSB_SET_REQUEST_TYPE 0x40 #define METROUSB_SET_MODEM_CTRL_REQUEST 10 #define METROUSB_SET_BREAK_REQUEST 0x40 #define METROUSB_MCR_NONE 0x08 /* Deactivate DTR and RTS. */ #define METROUSB_MCR_RTS 0x0a /* Activate RTS. */ #define METROUSB_MCR_DTR 0x09 /* Activate DTR. */ #define WDR_TIMEOUT 5000 /* default urb timeout. */ /* Private data structure. */ struct metrousb_private { spinlock_t lock; int throttled; unsigned long control_state; }; /* Device table list. */ static const struct usb_device_id id_table[] = { { USB_DEVICE(FOCUS_VENDOR_ID, FOCUS_PRODUCT_ID_BI) }, { USB_DEVICE(FOCUS_VENDOR_ID, FOCUS_PRODUCT_ID_UNI) }, { USB_DEVICE_INTERFACE_CLASS(0x0c2e, 0x0730, 0xff) }, /* MS7820 */ { }, /* Terminating entry. 
*/
};
MODULE_DEVICE_TABLE(usb, id_table);

/* UNI-Directional mode commands for device configure */
#define UNI_CMD_OPEN	0x80
#define UNI_CMD_CLOSE	0xFF

/*
 * Report whether the device runs in uni-directional mode.
 *
 * Returns non-zero when the product id from the device descriptor equals
 * FOCUS_PRODUCT_ID_UNI, zero otherwise.
 */
static int metrousb_is_unidirectional_mode(struct usb_serial *serial)
{
	u16 product_id = le16_to_cpu(serial->dev->descriptor.idProduct);

	return product_id == FOCUS_PRODUCT_ID_UNI;
}

/*
 * Port-count callback (installed as .calc_num_ports).
 *
 * Returns 1, or -ENODEV when a uni-directional device has no interrupt-out
 * endpoint (that endpoint is what metrousb_send_unidirectional_cmd() uses).
 */
static int metrousb_calc_num_ports(struct usb_serial *serial,
				   struct usb_serial_endpoints *epds)
{
	if (metrousb_is_unidirectional_mode(serial)) {
		if (epds->num_interrupt_out == 0) {
			dev_err(&serial->interface->dev, "interrupt-out endpoint missing\n");
			return -ENODEV;
		}
	}

	return 1;
}

/*
 * Send a one-byte command to a uni-directional device over its
 * interrupt-out endpoint.
 *
 * Returns 0 without doing anything for devices that are not in
 * uni-directional mode.  Otherwise returns 0 on success, -ENOMEM if the
 * transfer buffer cannot be allocated, the negative error returned by
 * usb_interrupt_msg(), or -EIO if the transfer length is not one byte.
 */
static int metrousb_send_unidirectional_cmd(u8 cmd, struct usb_serial_port *port)
{
	int ret;
	int actual_len;
	u8 *buffer_cmd = NULL;

	if (!metrousb_is_unidirectional_mode(port->serial))
		return 0;

	/*
	 * The command byte is copied into a heap buffer rather than sent from
	 * the stack (presumably because the USB core needs a DMA-capable
	 * buffer - TODO confirm).
	 */
	buffer_cmd = kzalloc(sizeof(cmd), GFP_KERNEL);
	if (!buffer_cmd)
		return -ENOMEM;

	*buffer_cmd = cmd;

	ret = usb_interrupt_msg(port->serial->dev,
		usb_sndintpipe(port->serial->dev, port->interrupt_out_endpointAddress),
		buffer_cmd, sizeof(cmd),
		&actual_len, USB_CTRL_SET_TIMEOUT);

	kfree(buffer_cmd);

	if (ret < 0)
		return ret;
	else if (actual_len != sizeof(cmd))
		return -EIO;
	return 0;
}

/*
 * Completion handler for the interrupt-in urb.
 *
 * On success the received bytes are pushed to the tty layer and the urb is
 * resubmitted unless the port is throttled.  A terminated urb (-ECONNRESET,
 * -ENOENT, -ESHUTDOWN) is not resubmitted; any other error status skips the
 * data handling and goes straight to resubmission.
 */
static void metrousb_read_int_callback(struct urb *urb)
{
	struct usb_serial_port *port = urb->context;
	struct metrousb_private *metro_priv = usb_get_serial_port_data(port);
	unsigned char *data = urb->transfer_buffer;
	unsigned long flags;
	int throttled = 0;
	int result = 0;

	dev_dbg(&port->dev, "%s\n", __func__);

	switch (urb->status) {
	case 0:
		/* Success status, read from the port. */
		break;
	case -ECONNRESET:
	case -ENOENT:
	case -ESHUTDOWN:
		/* urb has been terminated. */
		dev_dbg(&port->dev,
			"%s - urb shutting down, error code=%d\n",
			__func__, urb->status);
		return;
	default:
		dev_dbg(&port->dev,
			"%s - non-zero urb received, error code=%d\n",
			__func__, urb->status);
		goto exit;
	}

	/* Set the data read from the usb port into the serial port buffer. */
	if (urb->actual_length) {
		/* Copy the received bytes into the tty flip buffer. */
		tty_insert_flip_string(&port->port, data, urb->actual_length);

		/* Force the data to the tty layer. */
		tty_flip_buffer_push(&port->port);
	}

	/* Sample the throttle flag under the lock. */
	spin_lock_irqsave(&metro_priv->lock, flags);
	throttled = metro_priv->throttled;
	spin_unlock_irqrestore(&metro_priv->lock, flags);

	/* While throttled the urb stays idle; metrousb_unthrottle() resubmits it. */
	if (throttled)
		return;
exit:
	/* Try to resubmit the urb. */
	result = usb_submit_urb(urb, GFP_ATOMIC);
	if (result)
		dev_err(&port->dev,
			"%s - failed submitting interrupt in urb, error code=%d\n",
			__func__, result);
}

/*
 * Close handler (installed as .close): stop the interrupt-in urb, then send
 * the close command.  The result of the close command is ignored.
 */
static void metrousb_cleanup(struct usb_serial_port *port)
{
	usb_kill_urb(port->interrupt_in_urb);

	metrousb_send_unidirectional_cmd(UNI_CMD_CLOSE, port);
}

/*
 * Open handler (installed as .open).
 *
 * Resets the cached control state and throttle flag, clears a halt on the
 * interrupt-in pipe, starts the interrupt-in urb and finally sends the open
 * command.  Returns 0 on success or the negative error from urb submission
 * or from the open command; in the latter case the urb is killed again.
 */
static int metrousb_open(struct tty_struct *tty, struct usb_serial_port *port)
{
	struct usb_serial *serial = port->serial;
	struct metrousb_private *metro_priv = usb_get_serial_port_data(port);
	unsigned long flags;
	int result = 0;

	/* Set the private data information for the port. */
	spin_lock_irqsave(&metro_priv->lock, flags);
	metro_priv->control_state = 0;
	metro_priv->throttled = 0;
	spin_unlock_irqrestore(&metro_priv->lock, flags);

	/* Clear the urb pipe; the return value is not checked. */
	usb_clear_halt(serial->dev, port->interrupt_in_urb->pipe);

	/* Start reading from the device */
	usb_fill_int_urb(port->interrupt_in_urb, serial->dev,
			  usb_rcvintpipe(serial->dev, port->interrupt_in_endpointAddress),
			   port->interrupt_in_urb->transfer_buffer,
			   port->interrupt_in_urb->transfer_buffer_length,
			   metrousb_read_int_callback, port, 1);

	result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
	if (result) {
		dev_err(&port->dev,
			"%s - failed submitting interrupt in urb, error code=%d\n",
			__func__, result);
		return result;
	}

	/* Send activate cmd to device */
	result = metrousb_send_unidirectional_cmd(UNI_CMD_OPEN, port);
	if (result) {
		dev_err(&port->dev,
			"%s - failed to configure device, error code=%d\n",
			__func__, result);
		goto err_kill_urb;
	}

	return 0;

err_kill_urb:
	usb_kill_urb(port->interrupt_in_urb);

	return result;
}

/*
 * Send the modem control state to the device with a control message.
 *
 * Returns the value returned by usb_control_msg(); negative values are
 * logged as errors.
 */
static int metrousb_set_modem_ctrl(struct usb_serial *serial, unsigned int control_state)
{
	int retval = 0;
	unsigned char mcr = METROUSB_MCR_NONE;

	dev_dbg(&serial->dev->dev, "%s - control state = %d\n",
		__func__, control_state);

	/* Set the modem control value. */
	if (control_state & TIOCM_DTR)
		mcr |= METROUSB_MCR_DTR;
	if (control_state & TIOCM_RTS)
		mcr |= METROUSB_MCR_RTS;

	/*
	 * Send the command to the usb port.
	 *
	 * NOTE(review): mcr is computed above but control_state, not mcr, is
	 * passed as the message value; mcr only appears in the error log
	 * below.  Confirm which value the device expects.
	 * NOTE(review): judging by the constant names, the request and
	 * request-type arguments may be swapped - confirm against the
	 * usb_control_msg() prototype and the device protocol.
	 */
	retval = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0),
				METROUSB_SET_REQUEST_TYPE, METROUSB_SET_MODEM_CTRL_REQUEST,
				control_state, 0, NULL, 0, WDR_TIMEOUT);
	if (retval < 0)
		dev_err(&serial->dev->dev,
			"%s - set modem ctrl=0x%x failed, error code=%d\n",
			__func__, mcr, retval);

	return retval;
}

/*
 * Port probe handler: allocate the zeroed per-port private data, initialise
 * its lock and attach it to the port.  Returns 0 or -ENOMEM.
 */
static int metrousb_port_probe(struct usb_serial_port *port)
{
	struct metrousb_private *metro_priv;

	metro_priv = kzalloc(sizeof(*metro_priv), GFP_KERNEL);
	if (!metro_priv)
		return -ENOMEM;

	spin_lock_init(&metro_priv->lock);

	usb_set_serial_port_data(port, metro_priv);

	return 0;
}

/* Port remove handler: free the per-port private data. */
static void metrousb_port_remove(struct usb_serial_port *port)
{
	struct metrousb_private *metro_priv;

	metro_priv = usb_get_serial_port_data(port);
	kfree(metro_priv);
}

/*
 * Throttle handler: set the throttled flag so that the read callback stops
 * resubmitting the interrupt-in urb.
 */
static void metrousb_throttle(struct tty_struct *tty)
{
	struct usb_serial_port *port = tty->driver_data;
	struct metrousb_private *metro_priv = usb_get_serial_port_data(port);
	unsigned long flags;

	/* Set the private information for the port to stop reading data. */
	spin_lock_irqsave(&metro_priv->lock, flags);
	metro_priv->throttled = 1;
	spin_unlock_irqrestore(&metro_priv->lock, flags);
}

/*
 * TIOCMGET handler: return the cached control state.  The value is the one
 * last stored by metrousb_tiocmset() or reset by metrousb_open(); the device
 * itself is not queried.
 */
static int metrousb_tiocmget(struct tty_struct *tty)
{
	unsigned long control_state = 0;
	struct usb_serial_port *port = tty->driver_data;
	struct metrousb_private *metro_priv = usb_get_serial_port_data(port);
	unsigned long flags;

	spin_lock_irqsave(&metro_priv->lock, flags);
	control_state = metro_priv->control_state;
	spin_unlock_irqrestore(&metro_priv->lock, flags);

	return control_state;
}

/*
 * TIOCMSET handler: update the cached RTS/DTR bits from @set and @clear
 * (clear is applied after set) and send the new state to the device.
 * Returns the result of metrousb_set_modem_ctrl().
 */
static int metrousb_tiocmset(struct tty_struct *tty,
			     unsigned int set, unsigned int clear)
{
	struct usb_serial_port *port = tty->driver_data;
	struct usb_serial *serial = port->serial;
	struct metrousb_private *metro_priv = usb_get_serial_port_data(port);
	unsigned long flags;
	unsigned long control_state = 0;

	dev_dbg(&port->dev, "%s - set=%d, clear=%d\n", __func__, set, clear);

	spin_lock_irqsave(&metro_priv->lock, flags);
	control_state = metro_priv->control_state;

	/* Set the RTS and DTR values. */
	if (set & TIOCM_RTS)
		control_state |= TIOCM_RTS;
	if (set & TIOCM_DTR)
		control_state |= TIOCM_DTR;
	if (clear & TIOCM_RTS)
		control_state &= ~TIOCM_RTS;
	if (clear & TIOCM_DTR)
		control_state &= ~TIOCM_DTR;

	metro_priv->control_state = control_state;
	spin_unlock_irqrestore(&metro_priv->lock, flags);
	/* The control message is sent after the lock has been dropped. */
	return metrousb_set_modem_ctrl(serial, control_state);
}

/*
 * Unthrottle handler: clear the throttled flag and resubmit the
 * interrupt-in urb so that reading resumes.
 */
static void metrousb_unthrottle(struct tty_struct *tty)
{
	struct usb_serial_port *port = tty->driver_data;
	struct metrousb_private *metro_priv = usb_get_serial_port_data(port);
	unsigned long flags;
	int result = 0;

	/* Set the private information for the port to resume reading data. */
	spin_lock_irqsave(&metro_priv->lock, flags);
	metro_priv->throttled = 0;
	spin_unlock_irqrestore(&metro_priv->lock, flags);

	/* Submit the urb to read from the port. */
	result = usb_submit_urb(port->interrupt_in_urb, GFP_ATOMIC);
	if (result)
		dev_err(&port->dev,
			"failed submitting interrupt in urb error code=%d\n",
			result);
}

/* usb-serial driver description tying the handlers above together. */
static struct usb_serial_driver metrousb_device = {
	.driver = {
		.name =		"metro-usb",
	},
	.description		= "Metrologic USB to Serial",
	.id_table		= id_table,
	.num_interrupt_in	= 1,
	.calc_num_ports		= metrousb_calc_num_ports,
	.open			= metrousb_open,
	.close			= metrousb_cleanup,
	.read_int_callback	= metrousb_read_int_callback,
	.port_probe		= metrousb_port_probe,
	.port_remove		= metrousb_port_remove,
	.throttle		= metrousb_throttle,
	.unthrottle		= metrousb_unthrottle,
	.tiocmget		= metrousb_tiocmget,
	.tiocmset		= metrousb_tiocmset,
};

/* NULL-terminated list of drivers registered by this module. */
static struct usb_serial_driver * const serial_drivers[] = {
	&metrousb_device,
	NULL,
};

module_usb_serial_driver(serial_drivers, id_table);

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Philip Nicastro");
MODULE_AUTHOR("Aleksey Babahin <tamerlan311@gmail.com>");
MODULE_DESCRIPTION(DRIVER_DESC);
2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 // SPDX-License-Identifier: GPL-2.0 #include <net/macsec.h> #include "netdevsim.h" static int nsim_macsec_find_secy(struct netdevsim *ns, sci_t sci) { int i; for (i = 0; i < NSIM_MACSEC_MAX_SECY_COUNT; i++) { if (ns->macsec.nsim_secy[i].sci == sci) return i; } return -1; } static int nsim_macsec_find_rxsc(struct nsim_secy *ns_secy, sci_t sci) { int i; for (i = 0; i < NSIM_MACSEC_MAX_RXSC_COUNT; i++) { if (ns_secy->nsim_rxsc[i].sci == sci) return i; } return -1; } static int nsim_macsec_add_secy(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); int idx; if (ns->macsec.nsim_secy_count == NSIM_MACSEC_MAX_SECY_COUNT) return -ENOSPC; for (idx = 0; idx < 
NSIM_MACSEC_MAX_SECY_COUNT; idx++) { if (!ns->macsec.nsim_secy[idx].used) break; } if (idx == NSIM_MACSEC_MAX_SECY_COUNT) { netdev_err(ctx->netdev, "%s: nsim_secy_count not full but all SecYs used\n", __func__); return -ENOSPC; } netdev_dbg(ctx->netdev, "%s: adding new secy with sci %016llx at index %d\n", __func__, sci_to_cpu(ctx->secy->sci), idx); ns->macsec.nsim_secy[idx].used = true; ns->macsec.nsim_secy[idx].nsim_rxsc_count = 0; ns->macsec.nsim_secy[idx].sci = ctx->secy->sci; ns->macsec.nsim_secy_count++; return 0; } static int nsim_macsec_upd_secy(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: updating secy with sci %016llx at index %d\n", __func__, sci_to_cpu(ctx->secy->sci), idx); return 0; } static int nsim_macsec_del_secy(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: removing SecY with SCI %016llx at index %d\n", __func__, sci_to_cpu(ctx->secy->sci), idx); ns->macsec.nsim_secy[idx].used = false; memset(&ns->macsec.nsim_secy[idx], 0, sizeof(ns->macsec.nsim_secy[idx])); ns->macsec.nsim_secy_count--; return 0; } static int nsim_macsec_add_rxsc(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); struct nsim_secy *secy; int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } secy = &ns->macsec.nsim_secy[idx]; if (secy->nsim_rxsc_count == NSIM_MACSEC_MAX_RXSC_COUNT) return -ENOSPC; 
for (idx = 0; idx < NSIM_MACSEC_MAX_RXSC_COUNT; idx++) { if (!secy->nsim_rxsc[idx].used) break; } if (idx == NSIM_MACSEC_MAX_RXSC_COUNT) netdev_err(ctx->netdev, "%s: nsim_rxsc_count not full but all RXSCs used\n", __func__); netdev_dbg(ctx->netdev, "%s: adding new rxsc with sci %016llx at index %d\n", __func__, sci_to_cpu(ctx->rx_sc->sci), idx); secy->nsim_rxsc[idx].used = true; secy->nsim_rxsc[idx].sci = ctx->rx_sc->sci; secy->nsim_rxsc_count++; return 0; } static int nsim_macsec_upd_rxsc(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); struct nsim_secy *secy; int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } secy = &ns->macsec.nsim_secy[idx]; idx = nsim_macsec_find_rxsc(secy, ctx->rx_sc->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in RXSC table\n", __func__, sci_to_cpu(ctx->rx_sc->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: updating RXSC with sci %016llx at index %d\n", __func__, sci_to_cpu(ctx->rx_sc->sci), idx); return 0; } static int nsim_macsec_del_rxsc(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); struct nsim_secy *secy; int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } secy = &ns->macsec.nsim_secy[idx]; idx = nsim_macsec_find_rxsc(secy, ctx->rx_sc->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in RXSC table\n", __func__, sci_to_cpu(ctx->rx_sc->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: removing RXSC with sci %016llx at index %d\n", __func__, sci_to_cpu(ctx->rx_sc->sci), idx); secy->nsim_rxsc[idx].used = false; memset(&secy->nsim_rxsc[idx], 0, sizeof(secy->nsim_rxsc[idx])); secy->nsim_rxsc_count--; return 0; } static int 
nsim_macsec_add_rxsa(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); struct nsim_secy *secy; int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } secy = &ns->macsec.nsim_secy[idx]; idx = nsim_macsec_find_rxsc(secy, ctx->sa.rx_sa->sc->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in RXSC table\n", __func__, sci_to_cpu(ctx->sa.rx_sa->sc->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: RXSC with sci %016llx, AN %u\n", __func__, sci_to_cpu(ctx->sa.rx_sa->sc->sci), ctx->sa.assoc_num); return 0; } static int nsim_macsec_upd_rxsa(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); struct nsim_secy *secy; int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } secy = &ns->macsec.nsim_secy[idx]; idx = nsim_macsec_find_rxsc(secy, ctx->sa.rx_sa->sc->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in RXSC table\n", __func__, sci_to_cpu(ctx->sa.rx_sa->sc->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: RXSC with sci %016llx, AN %u\n", __func__, sci_to_cpu(ctx->sa.rx_sa->sc->sci), ctx->sa.assoc_num); return 0; } static int nsim_macsec_del_rxsa(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); struct nsim_secy *secy; int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } secy = &ns->macsec.nsim_secy[idx]; idx = nsim_macsec_find_rxsc(secy, ctx->sa.rx_sa->sc->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in RXSC table\n", __func__, sci_to_cpu(ctx->sa.rx_sa->sc->sci)); return -ENOENT; } 
netdev_dbg(ctx->netdev, "%s: RXSC with sci %016llx, AN %u\n", __func__, sci_to_cpu(ctx->sa.rx_sa->sc->sci), ctx->sa.assoc_num); return 0; } static int nsim_macsec_add_txsa(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: SECY with sci %016llx, AN %u\n", __func__, sci_to_cpu(ctx->secy->sci), ctx->sa.assoc_num); return 0; } static int nsim_macsec_upd_txsa(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: SECY with sci %016llx, AN %u\n", __func__, sci_to_cpu(ctx->secy->sci), ctx->sa.assoc_num); return 0; } static int nsim_macsec_del_txsa(struct macsec_context *ctx) { struct netdevsim *ns = netdev_priv(ctx->netdev); int idx; idx = nsim_macsec_find_secy(ns, ctx->secy->sci); if (idx < 0) { netdev_err(ctx->netdev, "%s: sci %016llx not found in secy table\n", __func__, sci_to_cpu(ctx->secy->sci)); return -ENOENT; } netdev_dbg(ctx->netdev, "%s: SECY with sci %016llx, AN %u\n", __func__, sci_to_cpu(ctx->secy->sci), ctx->sa.assoc_num); return 0; } static const struct macsec_ops nsim_macsec_ops = { .mdo_add_secy = nsim_macsec_add_secy, .mdo_upd_secy = nsim_macsec_upd_secy, .mdo_del_secy = nsim_macsec_del_secy, .mdo_add_rxsc = nsim_macsec_add_rxsc, .mdo_upd_rxsc = nsim_macsec_upd_rxsc, .mdo_del_rxsc = nsim_macsec_del_rxsc, .mdo_add_rxsa = nsim_macsec_add_rxsa, .mdo_upd_rxsa = nsim_macsec_upd_rxsa, .mdo_del_rxsa = nsim_macsec_del_rxsa, .mdo_add_txsa = nsim_macsec_add_txsa, .mdo_upd_txsa = nsim_macsec_upd_txsa, .mdo_del_txsa = nsim_macsec_del_txsa, }; void 
nsim_macsec_init(struct netdevsim *ns) { ns->netdev->macsec_ops = &nsim_macsec_ops; ns->netdev->features |= NETIF_F_HW_MACSEC; memset(&ns->macsec, 0, sizeof(ns->macsec)); } void nsim_macsec_teardown(struct netdevsim *ns) { }
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for some a4tech "special" devices * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby */ /* */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include "hid-ids.h" #define A4_2WHEEL_MOUSE_HACK_7 0x01 #define A4_2WHEEL_MOUSE_HACK_B8 0x02 #define A4_WHEEL_ORIENTATION (HID_UP_GENDESK | 0x000000b8) struct a4tech_sc { unsigned long quirks; unsigned int hw_wheel; __s32 delayed_value; }; static int a4_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct a4tech_sc *a4 = hid_get_drvdata(hdev); if (a4->quirks & A4_2WHEEL_MOUSE_HACK_B8 && usage->hid == A4_WHEEL_ORIENTATION) { /* * We do not want to have this usage mapped to anything as it's * nonstandard and doesn't really behave like an HID report. * It's only selecting the orientation (vertical/horizontal) of * the previous mouse wheel report. The input_events will be * generated once both reports are recorded in a4_event(). 
*/ return -1; } return 0; } static int a4_input_mapped(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct a4tech_sc *a4 = hid_get_drvdata(hdev); if (usage->type == EV_REL && usage->code == REL_WHEEL_HI_RES) { set_bit(REL_HWHEEL, *bit); set_bit(REL_HWHEEL_HI_RES, *bit); } if ((a4->quirks & A4_2WHEEL_MOUSE_HACK_7) && usage->hid == 0x00090007) return -1; return 0; } static int a4_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct a4tech_sc *a4 = hid_get_drvdata(hdev); struct input_dev *input; if (!(hdev->claimed & HID_CLAIMED_INPUT) || !field->hidinput) return 0; input = field->hidinput->input; if (a4->quirks & A4_2WHEEL_MOUSE_HACK_B8) { if (usage->type == EV_REL && usage->code == REL_WHEEL_HI_RES) { a4->delayed_value = value; return 1; } if (usage->hid == A4_WHEEL_ORIENTATION) { input_event(input, EV_REL, value ? REL_HWHEEL : REL_WHEEL, a4->delayed_value); input_event(input, EV_REL, value ? 
REL_HWHEEL_HI_RES : REL_WHEEL_HI_RES, a4->delayed_value * 120); return 1; } } if ((a4->quirks & A4_2WHEEL_MOUSE_HACK_7) && usage->hid == 0x00090007) { a4->hw_wheel = !!value; return 1; } if (usage->code == REL_WHEEL_HI_RES && a4->hw_wheel) { input_event(input, usage->type, REL_HWHEEL, value); input_event(input, usage->type, REL_HWHEEL_HI_RES, value * 120); return 1; } return 0; } static int a4_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct a4tech_sc *a4; int ret; a4 = devm_kzalloc(&hdev->dev, sizeof(*a4), GFP_KERNEL); if (a4 == NULL) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } a4->quirks = id->driver_data; hid_set_drvdata(hdev, a4); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); return ret; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); return ret; } return 0; } static const struct hid_device_id a4_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_WCP32PU), .driver_data = A4_2WHEEL_MOUSE_HACK_7 }, { HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_X5_005D), .driver_data = A4_2WHEEL_MOUSE_HACK_B8 }, { HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_RP_649), .driver_data = A4_2WHEEL_MOUSE_HACK_B8 }, { HID_USB_DEVICE(USB_VENDOR_ID_A4TECH, USB_DEVICE_ID_A4TECH_NB_95), .driver_data = A4_2WHEEL_MOUSE_HACK_B8 }, { } }; MODULE_DEVICE_TABLE(hid, a4_devices); static struct hid_driver a4_driver = { .name = "a4tech", .id_table = a4_devices, .input_mapping = a4_input_mapping, .input_mapped = a4_input_mapped, .event = a4_event, .probe = a4_probe, }; module_hid_driver(a4_driver); MODULE_DESCRIPTION("HID driver for some a4tech \"special\" devices"); MODULE_LICENSE("GPL");
1 1 1 1 5 1 1 9 9 28 1 1 4 1 1 3 2 11 8 4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 /* HIDP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/compat.h> #include <linux/export.h> #include <linux/file.h> #include "hidp.h" static struct bt_sock_list hidp_sk_list = { .lock = __RW_LOCK_UNLOCKED(hidp_sk_list.lock) }; static int hidp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; bt_sock_unlink(&hidp_sk_list, sk); sock_orphan(sk); sock_put(sk); return 0; } static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct hidp_connadd_req ca; struct hidp_conndel_req cd; struct hidp_connlist_req cl; struct hidp_conninfo ci; struct socket *csock; struct socket *isock; int err; BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case HIDPCONNADD: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; csock = sockfd_lookup(ca.ctrl_sock, &err); if (!csock) return err; isock = sockfd_lookup(ca.intr_sock, &err); if (!isock) { sockfd_put(csock); return err; } ca.name[sizeof(ca.name)-1] = 0; err = hidp_connection_add(&ca, csock, isock); if (!err && copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; sockfd_put(csock); sockfd_put(isock); return err; case HIDPCONNDEL: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; return hidp_connection_del(&cd); case HIDPGETCONNLIST: if (copy_from_user(&cl, argp, sizeof(cl))) return -EFAULT; if (cl.cnum <= 0) return -EINVAL; err = hidp_get_connlist(&cl); if (!err && 
copy_to_user(argp, &cl, sizeof(cl))) return -EFAULT; return err; case HIDPGETCONNINFO: if (copy_from_user(&ci, argp, sizeof(ci))) return -EFAULT; err = hidp_get_conninfo(&ci); if (!err && copy_to_user(argp, &ci, sizeof(ci))) return -EFAULT; return err; } return -EINVAL; } static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT struct compat_hidp_connadd_req { int ctrl_sock; /* Connected control socket */ int intr_sock; /* Connected interrupt socket */ __u16 parser; __u16 rd_size; compat_uptr_t rd_data; __u8 country; __u8 subclass; __u16 vendor; __u16 product; __u16 version; __u32 flags; __u32 idle_to; char name[128]; }; static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int err; if (cmd == HIDPGETCONNLIST) { struct hidp_connlist_req cl; u32 __user *p = argp; u32 uci; if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); if (cl.cnum <= 0) return -EINVAL; err = hidp_get_connlist(&cl); if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } else if (cmd == HIDPCONNADD) { struct compat_hidp_connadd_req ca32; struct hidp_connadd_req ca; struct socket *csock; struct socket *isock; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32))) return -EFAULT; ca.ctrl_sock = ca32.ctrl_sock; ca.intr_sock = ca32.intr_sock; ca.parser = ca32.parser; ca.rd_size = ca32.rd_size; ca.rd_data = compat_ptr(ca32.rd_data); ca.country = ca32.country; ca.subclass = ca32.subclass; ca.vendor = ca32.vendor; ca.product = ca32.product; ca.version = ca32.version; ca.flags = ca32.flags; ca.idle_to = ca32.idle_to; ca32.name[sizeof(ca32.name) - 1] = '\0'; memcpy(ca.name, ca32.name, 128); csock = sockfd_lookup(ca.ctrl_sock, &err); if (!csock) return err; isock = sockfd_lookup(ca.intr_sock, &err); if (!isock) { 
sockfd_put(csock); return err; } err = hidp_connection_add(&ca, csock, isock); if (!err && copy_to_user(argp, &ca32, sizeof(ca32))) err = -EFAULT; sockfd_put(csock); sockfd_put(isock); return err; } return hidp_sock_ioctl(sock, cmd, arg); } #endif static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, .ioctl = hidp_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hidp_sock_compat_ioctl, #endif .bind = sock_no_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto hidp_proto = { .name = "HIDP", .owner = THIS_MODULE, .obj_size = sizeof(struct bt_sock) }; static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sk = bt_sock_alloc(net, sock, &hidp_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sock->ops = &hidp_sock_ops; sock->state = SS_UNCONNECTED; bt_sock_link(&hidp_sk_list, sk); return 0; } static const struct net_proto_family hidp_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = hidp_sock_create }; int __init hidp_init_sockets(void) { int err; err = proto_register(&hidp_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops); if (err < 0) { BT_ERR("Can't register HIDP socket"); goto error; } err = bt_procfs_init(&init_net, "hidp", &hidp_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create HIDP proc file"); bt_sock_unregister(BTPROTO_HIDP); goto error; } BT_INFO("HIDP socket layer initialized"); return 0; error: proto_unregister(&hidp_proto); return err; } void __exit hidp_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "hidp"); bt_sock_unregister(BTPROTO_HIDP); 
proto_unregister(&hidp_proto); }
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> /* * 32 bit structures for IA32 support. */ #include <uapi/asm/sigcontext.h> /* signal.h */ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; /* This matches struct stat64 in glibc2.2, hence the absolutely * insane amounts of padding around dev_t's. */ struct stat64 { unsigned long long st_dev; unsigned char __pad0[4]; #define STAT64_HAS_BROKEN_ST_INO 1 unsigned int __st_ino; unsigned int st_mode; unsigned int st_nlink; unsigned int st_uid; unsigned int st_gid; unsigned long long st_rdev; unsigned char __pad3[4]; long long st_size; unsigned int st_blksize; long long st_blocks;/* Number 512-byte blocks allocated */ unsigned st_atime; unsigned st_atime_nsec; unsigned st_mtime; unsigned st_mtime_nsec; unsigned st_ctime; unsigned st_ctime_nsec; unsigned long long st_ino; } __attribute__((packed)); extern bool __ia32_enabled; static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } static inline void ia32_disable(void) { __ia32_enabled = false; } #else /* !CONFIG_IA32_EMULATION */ static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } static inline void ia32_disable(void) {} #endif static inline bool ia32_enabled_verbose(void) { bool enabled = ia32_enabled(); if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n"); return enabled; } #endif /* _ASM_X86_IA32_H */
187 188 620 582 9 2 2 4 3 10 2 4 4 604 541 544 285 26 20 1 1 276 254 2 21 5 251 3 254 25 72 172 99 100 99 1 103 103 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 
499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 IBM Corporation * * Author: * Mimi Zohar <zohar@us.ibm.com> */ #include <linux/module.h> #include <linux/init.h> #include <linux/file.h> #include <linux/binfmts.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/magic.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsverity.h> #include <keys/system_keyring.h> #include <uapi/linux/fsverity.h> #include "ima.h" #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM static char *ima_appraise_cmdline_default __initdata; core_param(ima_appraise, ima_appraise_cmdline_default, 
charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; if (!str) return; if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) appraisal_state = IMA_APPRAISE_LOG; else if (strncmp(str, "fix", 3) == 0) appraisal_state = IMA_APPRAISE_FIX; else if (strncmp(str, "enforce", 7) == 0) appraisal_state = IMA_APPRAISE_ENFORCE; else pr_err("invalid \"%s\" appraise option", str); /* If appraisal state was changed, but secure boot is enabled, * keep its default */ if (sb_state) { if (!(appraisal_state & IMA_APPRAISE_ENFORCE)) pr_info("Secure boot enabled: ignoring ima_appraise=%s option", str); } else { ima_appraise = appraisal_state; } } #endif /* * is_ima_appraise_enabled - return appraise status * * Only return enabled, if not in ima_appraise="fix" or "log" modes. */ bool is_ima_appraise_enabled(void) { return ima_appraise & IMA_APPRAISE_ENFORCE; } /* * ima_must_appraise - set appraise flag * * Return 1 to appraise or hash */ int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { struct lsm_prop prop; if (!ima_appraise) return 0; security_current_getlsmprop_subj(&prop); return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint) { int rc, offset; u8 algo = iint->ima_hash->algo; if (algo <= HASH_ALGO_SHA1) { offset = 1; iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST; } else { offset = 0; iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG; iint->ima_hash->xattr.ng.algo = algo; } rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, &iint->ima_hash->xattr.data[offset], (sizeof(iint->ima_hash->xattr) - offset) + iint->ima_hash->length, 0); return rc; } /* Return specific func appraised cached result */ enum 
integrity_status ima_get_cache_status(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return iint->ima_mmap_status; case BPRM_CHECK: return iint->ima_bprm_status; case CREDS_CHECK: return iint->ima_creds_status; case FILE_CHECK: case POST_SETATTR: return iint->ima_file_status; case MODULE_CHECK ... MAX_CHECK - 1: default: return iint->ima_read_status; } } static void ima_set_cache_status(struct ima_iint_cache *iint, enum ima_hooks func, enum integrity_status status) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->ima_mmap_status = status; break; case BPRM_CHECK: iint->ima_bprm_status = status; break; case CREDS_CHECK: iint->ima_creds_status = status; break; case FILE_CHECK: case POST_SETATTR: iint->ima_file_status = status; break; case MODULE_CHECK ... MAX_CHECK - 1: default: iint->ima_read_status = status; break; } } static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func) { switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: iint->flags |= (IMA_MMAP_APPRAISED | IMA_APPRAISED); break; case BPRM_CHECK: iint->flags |= (IMA_BPRM_APPRAISED | IMA_APPRAISED); break; case CREDS_CHECK: iint->flags |= (IMA_CREDS_APPRAISED | IMA_APPRAISED); break; case FILE_CHECK: case POST_SETATTR: iint->flags |= (IMA_FILE_APPRAISED | IMA_APPRAISED); break; case MODULE_CHECK ... 
MAX_CHECK - 1: default: iint->flags |= (IMA_READ_APPRAISED | IMA_APPRAISED); break; } } enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value, int xattr_len) { struct signature_v2_hdr *sig; enum hash_algo ret; if (!xattr_value || xattr_len < 2) /* return default hash algo */ return ima_hash_algo; switch (xattr_value->type) { case IMA_VERITY_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 3 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case EVM_IMA_XATTR_DIGSIG: sig = (typeof(sig))xattr_value; if (sig->version != 2 || xattr_len <= sizeof(*sig) || sig->hash_algo >= HASH_ALGO__LAST) return ima_hash_algo; return sig->hash_algo; case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ ret = xattr_value->data[0]; if (ret < HASH_ALGO__LAST) return ret; break; case IMA_XATTR_DIGEST: /* this is for backward compatibility */ if (xattr_len == 21) { unsigned int zero = 0; if (!memcmp(&xattr_value->data[16], &zero, 4)) return HASH_ALGO_MD5; else return HASH_ALGO_SHA1; } else if (xattr_len == 17) return HASH_ALGO_MD5; break; } /* return default hash algo */ return ima_hash_algo; } int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value, int xattr_len) { int ret; ret = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA, (char **)xattr_value, xattr_len, GFP_NOFS); if (ret == -EOPNOTSUPP) ret = 0; return ret; } /* * calc_file_id_hash - calculate the hash of the ima_file_id struct data * @type: xattr type [enum evm_ima_xattr_type] * @algo: hash algorithm [enum hash_algo] * @digest: pointer to the digest to be hashed * @hash: (out) pointer to the hash * * IMA signature version 3 disambiguates the data that is signed by * indirectly signing the hash of the ima_file_id structure data. * * Signing the ima_file_id struct is currently only supported for * IMA_VERITY_DIGSIG type xattrs. * * Return 0 on success, error code otherwise. 
*/ static int calc_file_id_hash(enum evm_ima_xattr_type type, enum hash_algo algo, const u8 *digest, struct ima_digest_data *hash) { struct ima_file_id file_id = { .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo}; unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo]; if (type != IMA_VERITY_DIGSIG) return -EINVAL; memcpy(file_id.hash, digest, hash_digest_size[algo]); hash->algo = algo; hash->length = hash_digest_size[algo]; return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash); } /* * xattr_verify - verify xattr digest or signature * * Verify whether the hash or signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint, struct evm_ima_xattr_data *xattr_value, int xattr_len, enum integrity_status *status, const char **cause) { struct ima_max_digest_data hash; struct signature_v2_hdr *sig; int rc = -EINVAL, hash_start = 0; int mask; switch (xattr_value->type) { case IMA_XATTR_DIGEST_NG: /* first byte contains algorithm id */ hash_start = 1; fallthrough; case IMA_XATTR_DIGEST: if (*status != INTEGRITY_PASS_IMMUTABLE) { if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) *cause = "verity-signature-required"; else *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } clear_bit(IMA_DIGSIG, &iint->atomic_flags); } else { set_bit(IMA_DIGSIG, &iint->atomic_flags); } if (xattr_len - sizeof(xattr_value->type) - hash_start >= iint->ima_hash->length) /* * xattr length may be longer. 
md5 hash in previous * version occupied 20 bytes in xattr, instead of 16 */ rc = memcmp(&xattr_value->data[hash_start], iint->ima_hash->digest, iint->ima_hash->length); else rc = -EINVAL; if (rc) { *cause = "invalid-hash"; *status = INTEGRITY_FAIL; break; } *status = INTEGRITY_PASS; break; case EVM_IMA_XATTR_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED; if ((iint->flags & mask) == mask) { *cause = "verity-signature-required"; *status = INTEGRITY_FAIL; break; } sig = (typeof(sig))xattr_value; if (sig->version >= 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc == -EOPNOTSUPP) { *status = INTEGRITY_UNKNOWN; break; } if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM, (const char *)xattr_value, xattr_len, iint->ima_hash->digest, iint->ima_hash->length); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } break; case IMA_VERITY_DIGSIG: set_bit(IMA_DIGSIG, &iint->atomic_flags); if (iint->flags & IMA_DIGSIG_REQUIRED) { if (!(iint->flags & IMA_VERITY_REQUIRED)) { *cause = "IMA-signature-required"; *status = INTEGRITY_FAIL; break; } } sig = (typeof(sig))xattr_value; if (sig->version != 3) { *cause = "invalid-signature-version"; *status = INTEGRITY_FAIL; break; } rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo, iint->ima_hash->digest, container_of(&hash.hdr, struct ima_digest_data, hdr)); if (rc) { *cause = "sigv3-hashing-error"; *status = INTEGRITY_FAIL; break; } rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA, (const char *)xattr_value, xattr_len, hash.digest, hash.hdr.length); if (rc) { *cause = "invalid-verity-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } 
break; default: *status = INTEGRITY_UNKNOWN; *cause = "unknown-ima-data"; break; } return rc; } /* * modsig_verify - verify modsig signature * * Verify whether the signature matches the file contents. * * Return 0 on success, error code otherwise. */ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, enum integrity_status *status, const char **cause) { int rc; rc = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig); if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc && func == KEXEC_KERNEL_CHECK) rc = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM, modsig); if (rc) { *cause = "invalid-signature"; *status = INTEGRITY_FAIL; } else { *status = INTEGRITY_PASS; } return rc; } /* * ima_check_blacklist - determine if the binary is blacklisted. * * Add the hash of the blacklisted binary to the measurement list, based * on policy. * * Returns -EPERM if the hash is blacklisted. */ int ima_check_blacklist(struct ima_iint_cache *iint, const struct modsig *modsig, int pcr) { enum hash_algo hash_algo; const u8 *digest = NULL; u32 digestsize = 0; int rc = 0; if (!(iint->flags & IMA_CHECK_BLACKLIST)) return 0; if (iint->flags & IMA_MODSIG_ALLOWED && modsig) { ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); rc = is_binary_blacklisted(digest, digestsize); } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) rc = is_binary_blacklisted(iint->ima_hash->digest, iint->ima_hash->length); if ((rc == -EPERM) && (iint->flags & IMA_MEASURE)) process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize, "blacklisted-hash", NONE, pcr, NULL, false, NULL, 0); return rc; } static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file) { struct linux_binprm *bprm; if (func == BPRM_CHECK) { bprm = container_of(&file, struct linux_binprm, file); return bprm->is_check; } return false; } /* * ima_appraise_measurement - appraise file measurement * * Call evm_verifyxattr() to verify the integrity of 'security.ima'. 
* Assuming success, compare the xattr hash with the collected measurement. * * Return 0 on success, error code otherwise */ int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig) { static const char op[] = "appraise_data"; int audit_msgno = AUDIT_INTEGRITY_DATA; const char *cause = "unknown"; struct dentry *dentry = file_dentry(file); struct inode *inode = d_backing_inode(dentry); enum integrity_status status = INTEGRITY_UNKNOWN; int rc = xattr_len; bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig; /* If not appraising a modsig, we need an xattr. */ if (!(inode->i_opflags & IOP_XATTR) && !try_modsig) return INTEGRITY_UNKNOWN; /* * Unlike any of the other LSM hooks where the kernel enforces file * integrity, enforcing file integrity for the bprm_creds_for_exec() * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion * of the script interpreter(userspace). Differentiate kernel and * userspace enforced integrity audit messages. */ if (is_bprm_creds_for_exec(func, file)) audit_msgno = AUDIT_INTEGRITY_USERSPACE; /* If reading the xattr failed and there's no modsig, error out. */ if (rc <= 0 && !try_modsig) { if (rc && rc != -ENODATA) goto out; if (iint->flags & IMA_DIGSIG_REQUIRED) { if (iint->flags & IMA_VERITY_REQUIRED) cause = "verity-signature-required"; else cause = "IMA-signature-required"; } else { cause = "missing-hash"; } status = INTEGRITY_NOLABEL; if (file->f_mode & FMODE_CREATED) iint->flags |= IMA_NEW_FILE; if ((iint->flags & IMA_NEW_FILE) && (!(iint->flags & IMA_DIGSIG_REQUIRED) || (inode->i_size == 0))) status = INTEGRITY_PASS; goto out; } status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value, rc < 0 ? 0 : rc); switch (status) { case INTEGRITY_PASS: case INTEGRITY_PASS_IMMUTABLE: case INTEGRITY_UNKNOWN: break; case INTEGRITY_NOXATTRS: /* No EVM protected xattrs. 
*/ /* It's fine not to have xattrs when using a modsig. */ if (try_modsig) break; fallthrough; case INTEGRITY_NOLABEL: /* No security.evm xattr. */ cause = "missing-HMAC"; goto out; case INTEGRITY_FAIL_IMMUTABLE: set_bit(IMA_DIGSIG, &iint->atomic_flags); cause = "invalid-fail-immutable"; goto out; case INTEGRITY_FAIL: /* Invalid HMAC/signature. */ cause = "invalid-HMAC"; goto out; default: WARN_ONCE(true, "Unexpected integrity status %d\n", status); } if (xattr_value) rc = xattr_verify(func, iint, xattr_value, xattr_len, &status, &cause); /* * If we have a modsig and either no imasig or the imasig's key isn't * known, then try verifying the modsig. */ if (try_modsig && (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG || rc == -ENOKEY)) rc = modsig_verify(func, modsig, &status, &cause); out: /* * File signatures on some filesystems can not be properly verified. * When such filesystems are mounted by an untrusted mounter or on a * system not willing to accept such a risk, fail the file signature * verification. */ if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) && ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) || (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) { status = INTEGRITY_FAIL; cause = "unverifiable-signature"; integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else if (status != INTEGRITY_PASS) { /* Fix mode, but don't replace file signatures. */ if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig && (!xattr_value || xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; } /* * Permit new files with file/EVM portable signatures, but * without data. 
*/ if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE && test_bit(IMA_DIGSIG, &iint->atomic_flags)) { status = INTEGRITY_PASS; } integrity_audit_msg(audit_msgno, inode, filename, op, cause, rc, 0); } else { ima_cache_flags(iint, func); } ima_set_cache_status(iint, func, status); return status; } /* * ima_update_xattr - update 'security.ima' hash value */ void ima_update_xattr(struct ima_iint_cache *iint, struct file *file) { struct dentry *dentry = file_dentry(file); int rc = 0; /* do not collect and update hash for digital signatures */ if (test_bit(IMA_DIGSIG, &iint->atomic_flags)) return; if ((iint->ima_file_status != INTEGRITY_PASS) && !(iint->flags & IMA_HASH)) return; rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL); if (rc < 0) return; inode_lock(file_inode(file)); ima_fix_xattr(dentry, iint); inode_unlock(file_inode(file)); } /** * ima_inode_post_setattr - reflect file metadata changes * @idmap: idmap of the mount the inode was found from * @dentry: pointer to the affected dentry * @ia_valid: for the UID and GID status * * Changes to a dentry's metadata might result in needing to appraise. * * This function is called from notify_change(), which expects the caller * to lock the inode's i_mutex. */ static void ima_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { struct inode *inode = d_backing_inode(dentry); struct ima_iint_cache *iint; int action; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) return; action = ima_must_appraise(idmap, inode, MAY_ACCESS, POST_SETATTR); iint = ima_iint_find(inode); if (iint) { set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags); if (!action) clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags); } } /* * ima_protect_xattr - protect 'security.ima' * * Ensure that not just anyone can modify or remove 'security.ima'. 
*/ static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len) { if (strcmp(xattr_name, XATTR_NAME_IMA) == 0) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 1; } return 0; } static void ima_reset_appraise_flags(struct inode *inode, int digsig) { struct ima_iint_cache *iint; if (!(ima_policy_flag & IMA_APPRAISE) || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find(inode); if (!iint) return; iint->measured_pcrs = 0; set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags); if (digsig) set_bit(IMA_DIGSIG, &iint->atomic_flags); else clear_bit(IMA_DIGSIG, &iint->atomic_flags); } /** * validate_hash_algo() - Block setxattr with unsupported hash algorithms * @dentry: object of the setxattr() * @xattr_value: userland supplied xattr value * @xattr_value_len: length of xattr_value * * The xattr value is mapped to its hash algorithm, and this algorithm * must be built in the kernel for the setxattr to be allowed. * * Emit an audit message when the algorithm is invalid. * * Return: 0 on success, else an error. 
*/ static int validate_hash_algo(struct dentry *dentry, const struct evm_ima_xattr_data *xattr_value, size_t xattr_value_len) { char *path = NULL, *pathbuf = NULL; enum hash_algo xattr_hash_algo; const char *errmsg = "unavailable-hash-algorithm"; unsigned int allowed_hashes; xattr_hash_algo = ima_get_hash_algo(xattr_value, xattr_value_len); allowed_hashes = atomic_read(&ima_setxattr_allowed_hash_algorithms); if (allowed_hashes) { /* success if the algorithm is allowed in the ima policy */ if (allowed_hashes & (1U << xattr_hash_algo)) return 0; /* * We use a different audit message when the hash algorithm * is denied by a policy rule, instead of not being built * in the kernel image */ errmsg = "denied-hash-algorithm"; } else { if (likely(xattr_hash_algo == ima_hash_algo)) return 0; /* allow any xattr using an algorithm built in the kernel */ if (crypto_has_alg(hash_algo_name[xattr_hash_algo], 0, 0)) return 0; } pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) return -EACCES; path = dentry_path(dentry, pathbuf, PATH_MAX); integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path, "set_data", errmsg, -EACCES, 0); kfree(pathbuf); return -EACCES; } static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len, int flags) { const struct evm_ima_xattr_data *xvalue = xattr_value; int digsig = 0; int result; int err; result = ima_protect_xattr(dentry, xattr_name, xattr_value, xattr_value_len); if (result == 1) { if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST)) return -EINVAL; err = validate_hash_algo(dentry, xvalue, xattr_value_len); if (err) return err; digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG); } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) { digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG); } if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), digsig); if (result == 1) result 
= 0; } return result; } static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (evm_revalidate_status(acl_name)) ima_reset_appraise_flags(d_backing_inode(dentry), 0); return 0; } static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) { int result; result = ima_protect_xattr(dentry, xattr_name, NULL, 0); if (result == 1 || evm_revalidate_status(xattr_name)) { ima_reset_appraise_flags(d_backing_inode(dentry), 0); if (result == 1) result = 0; } return result; } static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ima_inode_set_acl(idmap, dentry, acl_name, NULL); } static struct security_hook_list ima_appraise_hooks[] __ro_after_init = { LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr), LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr), LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl), LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr), LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl), }; void __init init_ima_appraise_lsm(const struct lsm_id *lsmid) { security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks), lsmid); }
2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#ifndef BTRFS_DISK_IO_H
#define BTRFS_DISK_IO_H

#include <linux/sizes.h>
#include <linux/compiler_types.h>
#include "ctree.h"
#include "fs.h"

struct block_device;
struct super_block;
struct extent_buffer;
struct btrfs_device;
struct btrfs_fs_devices;
struct btrfs_fs_info;
struct btrfs_super_block;
struct btrfs_trans_handle;
struct btrfs_tree_parent_check;
struct btrfs_transaction;

/*
 * Superblock copies: the shift is multiplied by the mirror number in
 * btrfs_sb_offset() to space the copies out on disk.
 */
#define BTRFS_SUPER_MIRROR_MAX 3
#define BTRFS_SUPER_MIRROR_SHIFT 12

/*
 * Fixed blocksize for all devices, applies to specific ways of reading
 * metadata like superblock. Must meet the set_blocksize requirements.
 *
 * Do not change.
 */
#define BTRFS_BDEV_BLOCKSIZE (4096)

/*
 * Byte offset of superblock copy @mirror on a device.
 *
 * Mirror 0 is the primary copy at BTRFS_SUPER_INFO_OFFSET.  Each further
 * mirror is located at SZ_16K shifted left by
 * BTRFS_SUPER_MIRROR_SHIFT * @mirror bits.
 */
static inline u64 btrfs_sb_offset(int mirror)
{
	u64 start = SZ_16K;

	if (mirror)
		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
	return BTRFS_SUPER_INFO_OFFSET;
}

void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr, u64 owner_root,
						int level);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
			   const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
			 const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
						   int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					const struct btrfs_key *key);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);

struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
					 u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
						 struct btrfs_path *path,
						 u64 objectid);
int btrfs_global_root_insert(struct btrfs_root *root);
void btrfs_global_root_delete(struct btrfs_root *root);
struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);

void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				 struct btrfs_root *root);
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
				 const struct btrfs_tree_parent_check *check);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif

/*
 * Take a reference on @root so that it is not freed while the caller is
 * using it.  This does not ensure that the tree is not dropped.
 *
 * Returns @root with its reference count raised, or NULL when @root is
 * NULL or its reference count has already reached zero.
 */
static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
{
	if (!root)
		return NULL;
	if (refcount_inc_not_zero(&root->refs))
		return root;
	return NULL;
}

void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
			     struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
			  int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
			     const struct btrfs_tree_parent_check *check);

blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
			     struct btrfs_fs_info *fs_info);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);

#endif
64 64 163 186 185 201 141 111 8 25 142 36 122 137 93 61 60 40 47 33 40 4 60 46 113 138 93 100 127 138 138 190 189 97 78 170 93 24 12 16 16 16 188 93 1 93 93 93 93 93 93 2 93 93 4 93 93 93 93 93 1 42 43 42 43 43 43 120 120 120 1 120 120 119 118 14 11 14 6 5 3 12 9 9 75 86 1 86 173 4 174 49 138 170 170 186 184 185 9 23 94 99 99 79 93 81 17 166 167 35 171 1 1 1 1 1 1 1 161 24 157 16 159 3 3 2 2 2 2 2 1 2 119 2 120 44 2 43 94 16 94 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 
920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 
1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 * This code builds two trees of free cluster extents.
 * Trees are sorted by start of extent and by length of extent.
 * NTFS_MAX_WND_EXTENTS defines the maximum number of elements in trees.
 * In the extreme case the code reads the on-disk bitmap to find free clusters.
 *
 */

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/kernel.h>

#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * Maximum number of extents in tree.
 */
#define NTFS_MAX_WND_EXTENTS (32u * 1024u)

/* Red-black tree node paired with the key it is ordered by. */
struct rb_node_key {
	struct rb_node node;
	size_t key;
};

/* One free extent, linked into both trees at the same time. */
struct e_node {
	struct rb_node_key start; /* Tree sorted by start. */
	struct rb_node_key count; /* Tree sorted by len. */
};

static int wnd_rescan(struct wnd_bitmap *wnd);
static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw);
static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits);

/* Slab cache that all e_node objects are allocated from. */
static struct kmem_cache *ntfs_enode_cachep;

/* Create the e_node slab cache.  Return: 0 on success, -ENOMEM otherwise. */
int __init ntfs3_init_bitmap(void)
{
	ntfs_enode_cachep = kmem_cache_create("ntfs3_enode_cache",
					      sizeof(struct e_node), 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
	return ntfs_enode_cachep ? 0 : -ENOMEM;
}

/* Destroy the e_node slab cache. */
void ntfs3_exit_bitmap(void)
{
	kmem_cache_destroy(ntfs_enode_cachep);
}

/*
 * wnd_scan
 *
 * Scan the range [wpos, wend) of the bitmap window @buf for a run of at
 * least @to_alloc zero bits.
 *
 * @wbit:      bit number added to window positions to form the returned
 *             position and @b_pos.
 * @prev_tail: in/out, length of the zero run that ended the previously
 *             scanned range; it is joined with a zero run starting at
 *             @wpos and updated when this range also ends in zero bits.
 * @b_pos, @b_len: in/out, position and length of the biggest fragment
 *             seen so far.
 *
 * Return: position of the first fitting run, -1 if not found.
 */
static size_t wnd_scan(const void *buf, size_t wbit, u32 wpos, u32 wend,
		       size_t to_alloc, size_t *prev_tail, size_t *b_pos,
		       size_t *b_len)
{
	while (wpos < wend) {
		size_t free_len;
		u32 free_bits, end;
		/* Position of the next zero bit, i.e. end of the set run. */
		u32 used = find_next_zero_bit_le(buf, wend, wpos);

		if (used >= wend) {
			/* No zero bit left: the carried tail run is final. */
			if (*b_len < *prev_tail) {
				*b_pos = wbit - *prev_tail;
				*b_len = *prev_tail;
			}

			*prev_tail = 0;
			return -1;
		}

		if (used > wpos) {
			/* Set bits precede the run: the carried tail ends here. */
			wpos = used;
			if (*b_len < *prev_tail) {
				*b_pos = wbit - *prev_tail;
				*b_len = *prev_tail;
			}

			*prev_tail = 0;
		}

		/*
		 * Now we have a fragment [wpos, wend) starting with 0.
		 */
		/* Look no further than needed to complete @to_alloc bits. */
		end = wpos + to_alloc - *prev_tail;
		free_bits = find_next_bit_le(buf, min(end, wend), wpos);

		free_len = *prev_tail + free_bits - wpos;

		if (*b_len < free_len) {
			*b_pos = wbit + wpos - *prev_tail;
			*b_len = free_len;
		}

		if (free_len >= to_alloc)
			return wbit + wpos - *prev_tail;

		if (free_bits >= wend) {
			/* Run reaches the end of the range: carry it over. */
			*prev_tail += free_bits - wpos;
			return -1;
		}

		wpos = free_bits + 1;
		*prev_tail = 0;
	}

	return -1;
}

/*
 * wnd_close - Frees all resources.
 *
 * Releases the free_bits array and the run, then erases every node from
 * the start tree and returns its e_node to the slab cache.
 *
 * NOTE(review): only start_tree is walked here; count_tree and wnd->count
 * are not reset in this function - confirm callers do not rely on them
 * afterwards.
 */
void wnd_close(struct wnd_bitmap *wnd)
{
	struct rb_node *node, *next;

	kvfree(wnd->free_bits);
	wnd->free_bits = NULL;
	run_close(&wnd->run);

	node = rb_first(&wnd->start_tree);

	while (node) {
		/* Fetch the successor before the node is unlinked and freed. */
		next = rb_next(node);
		rb_erase(node, &wnd->start_tree);
		kmem_cache_free(ntfs_enode_cachep,
				rb_entry(node, struct e_node, start.node));
		node = next;
	}
}

/*
 * rb_lookup - Find the node with key @v or, failing that, its predecessor.
 *
 * Return: the node whose key equals @v; otherwise the last node passed on
 * the way down whose key is smaller than @v; NULL if there is none.
 */
static struct rb_node *rb_lookup(struct rb_root *root, size_t v)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *r = NULL;

	while (*p) {
		struct rb_node_key *k;

		k = rb_entry(*p, struct rb_node_key, node);
		if (v < k->key) {
			p = &(*p)->rb_left;
		} else if (v > k->key) {
			/* Candidate predecessor; a closer one may follow. */
			r = &k->node;
			p = &(*p)->rb_right;
		} else {
			return &k->node;
		}
	}

	return r;
}

/*
 * rb_insert_count - Helper function to insert special kind of 'count' tree.
 *
 * Larger counts are placed to the left; extents of equal count are ordered
 * by ascending start.
 *
 * Return: true on success, false (after a warning) if an extent with the
 * same count and start is already in the tree.
 */
static inline bool rb_insert_count(struct rb_root *root, struct e_node *e)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	size_t e_ckey = e->count.key;
	size_t e_skey = e->start.key;

	while (*p) {
		struct e_node *k =
			rb_entry(parent = *p, struct e_node, count.node);

		if (e_ckey > k->count.key) {
			p = &(*p)->rb_left;
		} else if (e_ckey < k->count.key) {
			p = &(*p)->rb_right;
		} else if (e_skey < k->start.key) {
			p = &(*p)->rb_left;
		} else if (e_skey > k->start.key) {
			p = &(*p)->rb_right;
		} else {
			WARN_ON(1);
			return false;
		}
	}

	rb_link_node(&e->count.node, parent, p);
	rb_insert_color(&e->count.node, root);
	return true;
}

/*
 * rb_insert_start - Helper function to insert special kind of 'start' tree.
*/
static inline bool rb_insert_start(struct rb_root *root, struct e_node *e)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	size_t e_skey = e->start.key;

	while (*p) {
		struct e_node *k;

		parent = *p;

		k = rb_entry(parent, struct e_node, start.node);
		if (e_skey < k->start.key) {
			p = &(*p)->rb_left;
		} else if (e_skey > k->start.key) {
			p = &(*p)->rb_right;
		} else {
			/* Two extents must never share a start. */
			WARN_ON(1);
			return false;
		}
	}

	rb_link_node(&e->start.node, parent, p);
	rb_insert_color(&e->start.node, root);
	return true;
}

/*
 * wnd_add_free_ext - Adds a new extent of free space.
 * @wnd:	Bitmap whose extent trees are updated.
 * @bit:	First bit of the free range.
 * @len:	Length of the free range in bits.
 * @build:	1 when building tree.
 *
 * When @build is false the range is first merged with cached extents that
 * touch or overlap it and, if the trees are not fully up to date, grown by
 * probing neighbouring bits in the on-disk bitmap.
 * Once NTFS_MAX_WND_EXTENTS nodes are cached, the new extent is kept only
 * if it is longer than the currently smallest one, which it then replaces;
 * 'uptodated' becomes -1 in that case.
 */
static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
			     bool build)
{
	struct e_node *e, *e0 = NULL;
	size_t ib, end_in = bit + len;
	struct rb_node *n;

	if (build) {
		/* Use extent_min to filter too short extents. */
		if (wnd->count >= NTFS_MAX_WND_EXTENTS &&
		    len <= wnd->extent_min) {
			wnd->uptodated = -1;
			return;
		}
	} else {
		/* Try to find extent before 'bit'. */
		n = rb_lookup(&wnd->start_tree, bit);

		if (!n) {
			n = rb_first(&wnd->start_tree);
		} else {
			e = rb_entry(n, struct e_node, start.node);
			n = rb_next(n);
			if (e->start.key + e->count.key == bit) {
				/* Remove left. */
				bit = e->start.key;
				len += e->count.key;
				rb_erase(&e->start.node, &wnd->start_tree);
				rb_erase(&e->count.node, &wnd->count_tree);
				wnd->count -= 1;
				/* Keep the node so it can be reused below. */
				e0 = e;
			}
		}

		while (n) {
			size_t next_end;

			e = rb_entry(n, struct e_node, start.node);
			next_end = e->start.key + e->count.key;
			if (e->start.key > end_in)
				break;

			/* Remove right. */
			n = rb_next(n);
			len += next_end - end_in;
			end_in = next_end;
			rb_erase(&e->start.node, &wnd->start_tree);
			rb_erase(&e->count.node, &wnd->count_tree);
			wnd->count -= 1;

			if (!e0)
				e0 = e;
			else
				kmem_cache_free(ntfs_enode_cachep, e);
		}

		if (wnd->uptodated != 1) {
			/* Check bits before 'bit'. */
			ib = wnd->zone_bit == wnd->zone_end ||
					     bit < wnd->zone_end ?
				     0 :
				     wnd->zone_end;

			while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) {
				bit -= 1;
				len += 1;
			}

			/* Check bits after 'end_in'. */
			ib = wnd->zone_bit == wnd->zone_end ||
					     end_in > wnd->zone_bit ?
				     wnd->nbits :
				     wnd->zone_bit;

			while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) {
				end_in += 1;
				len += 1;
			}
		}
	}
	/* Insert new fragment. */
	if (wnd->count >= NTFS_MAX_WND_EXTENTS) {
		if (e0)
			kmem_cache_free(ntfs_enode_cachep, e0);

		wnd->uptodated = -1;

		/* Compare with smallest fragment. */
		n = rb_last(&wnd->count_tree);
		e = rb_entry(n, struct e_node, count.node);
		if (len <= e->count.key)
			goto out; /* Do not insert small fragments. */

		if (build) {
			struct e_node *e2;

			n = rb_prev(n);
			e2 = rb_entry(n, struct e_node, count.node);
			/* Smallest fragment will be 'e2->count.key'. */
			wnd->extent_min = e2->count.key;
		}

		/* Replace smallest fragment by new one. */
		rb_erase(&e->start.node, &wnd->start_tree);
		rb_erase(&e->count.node, &wnd->count_tree);
		wnd->count -= 1;
	} else {
		e = e0 ? e0 : kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC);
		if (!e) {
			/* The extent is not cached; trees are now incomplete. */
			wnd->uptodated = -1;
			goto out;
		}

		if (build && len <= wnd->extent_min)
			wnd->extent_min = len;
	}
	e->start.key = bit;
	e->count.key = len;
	if (len > wnd->extent_max)
		wnd->extent_max = len;

	rb_insert_start(&wnd->start_tree, e);
	rb_insert_count(&wnd->count_tree, e);
	wnd->count += 1;

out:;
}

/*
 * wnd_remove_free_ext - Remove a run from the cached free space.
 * @wnd:	Bitmap whose extent trees are updated.
 * @bit:	First bit of the run that is no longer free.
 * @len:	Length of the run in bits.
 *
 * The cached extent found for @bit is trimmed, split in two or dropped;
 * 'extent_max' is refreshed when the biggest extent is affected. If the
 * trees end up empty while not known to be complete, the bitmap is
 * rescanned.
 */
static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
{
	struct rb_node *n, *n3;
	struct e_node *e, *e3;
	size_t end_in = bit + len;
	size_t end3, end, new_key, new_len, max_new_len;

	/* Try to find extent before 'bit'. */
	n = rb_lookup(&wnd->start_tree, bit);

	if (!n)
		return;

	e = rb_entry(n, struct e_node, start.node);
	end = e->start.key + e->count.key;

	new_key = new_len = 0;
	/* From here on 'len' is the length that remains in 'e'. */
	len = e->count.key;

	/* Range [bit,end_in) must be inside 'e' or outside 'e' and 'n'. */
	if (e->start.key > bit)
		;
	else if (end_in <= end) {
		/* Range [bit,end_in) inside 'e'. */
		new_key = end_in;
		new_len = end - end_in;
		len = bit - e->start.key;
	} else if (bit > end) {
		bool bmax = false;

		/* Range starts after 'e': walk the extents that follow it. */
		n3 = rb_next(n);

		while (n3) {
			e3 = rb_entry(n3, struct e_node, start.node);
			if (e3->start.key >= end_in)
				break;

			if (e3->count.key == wnd->extent_max)
				bmax = true;

			end3 = e3->start.key + e3->count.key;
			if (end3 > end_in) {
				/* Partially covered: keep the part past end_in. */
				e3->start.key = end_in;
				rb_erase(&e3->count.node, &wnd->count_tree);
				e3->count.key = end3 - end_in;
				rb_insert_count(&wnd->count_tree, e3);
				break;
			}

			n3 = rb_next(n3);
			rb_erase(&e3->start.node, &wnd->start_tree);
			rb_erase(&e3->count.node, &wnd->count_tree);
			wnd->count -= 1;
			kmem_cache_free(ntfs_enode_cachep, e3);
		}
		if (!bmax)
			return;
		/* A biggest extent was touched: re-read the maximum. */
		n3 = rb_first(&wnd->count_tree);
		wnd->extent_max =
			n3 ? rb_entry(n3, struct e_node, count.node)->count.key :
			     0;
		return;
	}

	if (e->count.key != wnd->extent_max) {
		;
	} else if (rb_prev(&e->count.node)) {
		;
	} else {
		/* 'e' is the first node of the count tree: recompute the max. */
		n3 = rb_next(&e->count.node);
		max_new_len = max(len, new_len);
		if (!n3) {
			wnd->extent_max = max_new_len;
		} else {
			e3 = rb_entry(n3, struct e_node, count.node);
			wnd->extent_max = max(e3->count.key, max_new_len);
		}
	}

	if (!len) {
		/* Nothing remains in front of the removed run. */
		if (new_len) {
			e->start.key = new_key;
			rb_erase(&e->count.node, &wnd->count_tree);
			e->count.key = new_len;
			rb_insert_count(&wnd->count_tree, e);
		} else {
			rb_erase(&e->start.node, &wnd->start_tree);
			rb_erase(&e->count.node, &wnd->count_tree);
			wnd->count -= 1;
			kmem_cache_free(ntfs_enode_cachep, e);
		}
		goto out;
	}
	/* Shrink 'e' to the part in front of the removed run. */
	rb_erase(&e->count.node, &wnd->count_tree);
	e->count.key = len;
	rb_insert_count(&wnd->count_tree, e);
	if (!new_len)
		goto out;

	/* A second extent is needed for the part behind the removed run. */
	if (wnd->count >= NTFS_MAX_WND_EXTENTS) {
		wnd->uptodated = -1;

		/* Get minimal extent. */
		e = rb_entry(rb_last(&wnd->count_tree), struct e_node,
			     count.node);
		if (e->count.key > new_len)
			goto out;

		/* Replace minimum. */
		rb_erase(&e->start.node, &wnd->start_tree);
		rb_erase(&e->count.node, &wnd->count_tree);
		wnd->count -= 1;
	} else {
		e = kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC);
		if (!e)
			wnd->uptodated = -1;
	}

	if (e) {
		e->start.key = new_key;
		e->count.key = new_len;
		rb_insert_start(&wnd->start_tree, e);
		rb_insert_count(&wnd->count_tree, e);
		wnd->count += 1;
	}

out:
	if (!wnd->count && 1 != wnd->uptodated)
		wnd_rescan(wnd);
}

/*
 * wnd_rescan - Scan all bitmap. Used while initialization.
 *
 * Rebuilds the extent trees, the per-window free counters and
 * 'total_zeroes' by walking every window of the bitmap. Windows already
 * known (after the first initialization) to be completely used or
 * completely free are not read from disk.
 *
 * Return: 0 on success, -ENOENT if a window is not mapped by wnd->run,
 *         -EIO if a block cannot be read.
 */
static int wnd_rescan(struct wnd_bitmap *wnd)
{
	int err = 0;
	size_t prev_tail = 0;
	struct super_block *sb = wnd->sb;
	struct ntfs_sb_info *sbi = sb->s_fs_info;
	u64 lbo, len = 0;
	u32 blocksize = sb->s_blocksize;
	u8 cluster_bits = sbi->cluster_bits;
	u32 wbits = 8 * sb->s_blocksize;
	u32 used, frb;
	size_t wpos, wbit, iw, vbo;
	struct buffer_head *bh = NULL;
	CLST lcn, clen;

	wnd->uptodated = 0;
	wnd->extent_max = 0;
	wnd->extent_min = MINUS_ONE_T;
	wnd->total_zeroes = 0;

	vbo = 0;

	for (iw = 0; iw < wnd->nwnd; iw++) {
		if (iw + 1 == wnd->nwnd)
			wbits = wnd->bits_last;

		if (wnd->inited) {
			if (!wnd->free_bits[iw]) {
				/* All ones. */
				if (prev_tail) {
					wnd_add_free_ext(wnd,
							 vbo * 8 - prev_tail,
							 prev_tail, true);
					prev_tail = 0;
				}
				goto next_wnd;
			}
			if (wbits == wnd->free_bits[iw]) {
				/* All zeroes. */
				prev_tail += wbits;
				wnd->total_zeroes += wbits;
				goto next_wnd;
			}
		}

		if (!len) {
			/* Current run fragment is used up: look up the next. */
			u32 off = vbo & sbi->cluster_mask;

			if (!run_lookup_entry(&wnd->run, vbo >> cluster_bits,
					      &lcn, &clen, NULL)) {
				err = -ENOENT;
				goto out;
			}

			lbo = ((u64)lcn << cluster_bits) + off;
			len = ((u64)clen << cluster_bits) - off;
		}

		bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits);
		if (!bh) {
			err = -EIO;
			goto out;
		}

		used = ntfs_bitmap_weight_le(bh->b_data, wbits);
		if (used < wbits) {
			frb = wbits - used;
			wnd->free_bits[iw] = frb;
			wnd->total_zeroes += frb;
		}

		wpos = 0;
		wbit = vbo * 8;

		if (wbit + wbits > wnd->nbits)
			wbits = wnd->nbits - wbit;

		do {
			/* 'used' now holds the position of the next zero bit. */
			used = find_next_zero_bit_le(bh->b_data, wbits, wpos);

			if (used > wpos && prev_tail) {
				wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
						 prev_tail, true);
				prev_tail = 0;
			}

			wpos = used;

			if (wpos >= wbits) {
				/* No free blocks. */
				prev_tail = 0;
				break;
			}

			frb = find_next_bit_le(bh->b_data, wbits, wpos);
			if (frb >= wbits) {
				/* Keep last free block. */
				prev_tail += frb - wpos;
				break;
			}

			wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
					 frb + prev_tail - wpos, true);

			/* Skip free block and first '1'. */
			wpos = frb + 1;
			/* Reset previous tail. */
			prev_tail = 0;
		} while (wpos < wbits);

next_wnd:

		if (bh)
			put_bh(bh);
		bh = NULL;

		vbo += blocksize;
		if (len) {
			len -= blocksize;
			lbo += blocksize;
		}
	}

	/* Add last block. */
	if (prev_tail)
		wnd_add_free_ext(wnd, wnd->nbits - prev_tail, prev_tail, true);

	/*
	 * Before init cycle wnd->uptodated was 0.
	 * If any errors or limits occurs while initialization then
	 * wnd->uptodated will be -1.
	 * If 'uptodated' is still 0 then Tree is really updated.
	 */
	if (!wnd->uptodated)
		wnd->uptodated = 1;

	if (wnd->zone_bit != wnd->zone_end) {
		size_t zlen = wnd->zone_end - wnd->zone_bit;

		/* Collapse the zone first so wnd_zone_set() re-reserves it. */
		wnd->zone_end = wnd->zone_bit;
		wnd_zone_set(wnd, wnd->zone_bit, zlen);
	}

out:
	return err;
}

/*
 * wnd_init - Initialize @wnd for a bitmap of @nbits bits and scan it.
 *
 * Return: 0 on success, -ENOMEM if the per-window counters cannot be
 *         allocated, otherwise the error returned by wnd_rescan().
 *
 * NOTE(review): on a wnd_rescan() failure 'free_bits' stays allocated here;
 * presumably the caller releases it through wnd_close() - confirm.
 */
int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
{
	int err;
	u32 blocksize = sb->s_blocksize;
	u32 wbits = blocksize * 8;

	init_rwsem(&wnd->rw_lock);

	wnd->sb = sb;
	wnd->nbits = nbits;
	wnd->total_zeroes = nbits;
	wnd->extent_max = MINUS_ONE_T;
	wnd->zone_bit = wnd->zone_end = 0;
	wnd->nwnd = bytes_to_block(sb, ntfs3_bitmap_size(nbits));
	/* Number of valid bits in the last window. */
	wnd->bits_last = nbits & (wbits - 1);
	if (!wnd->bits_last)
		wnd->bits_last = wbits;

	wnd->free_bits =
		kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO);

	if (!wnd->free_bits)
		return -ENOMEM;

	err = wnd_rescan(wnd);
	if (err)
		return err;

	wnd->inited = true;

	return 0;
}

/*
 * wnd_map - Call sb_bread for requested window.
 *
 * Return: Buffer head of window @iw (release it with put_bh()),
 *         ERR_PTR(-ENOENT) if the window is not mapped by wnd->run,
 *         ERR_PTR(-EIO) if the block cannot be read.
 */
static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw)
{
	size_t vbo;
	CLST lcn, clen;
	struct super_block *sb = wnd->sb;
	struct ntfs_sb_info *sbi;
	struct buffer_head *bh;
	u64 lbo;

	sbi = sb->s_fs_info;
	/* Byte offset of the window inside the bitmap. */
	vbo = (u64)iw << sb->s_blocksize_bits;

	if (!run_lookup_entry(&wnd->run, vbo >> sbi->cluster_bits, &lcn, &clen,
			      NULL)) {
		return ERR_PTR(-ENOENT);
	}

	lbo = ((u64)lcn << sbi->cluster_bits) + (vbo & sbi->cluster_mask);

	bh = ntfs_bread(wnd->sb, lbo >> sb->s_blocksize_bits);
	if (!bh)
		return ERR_PTR(-EIO);

	return bh;
}

/*
 * wnd_set_free - Mark the bits range from bit to bit + bits as free.
*/ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; u32 wbits = 8 * sb->s_blocksize; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbit = bit & (wbits - 1); struct buffer_head *bh; u32 op; for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } lock_buffer(bh); ntfs_bitmap_clear_le(bh->b_data, wbit, op); wnd->free_bits[iw] += op; wnd->total_zeroes += op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); wnd_add_free_ext(wnd, bit, op, false); } return err; } /* * wnd_set_used - Mark the bits range from bit to bit + bits as used. */ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) { int err = 0; struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); struct buffer_head *bh; u32 op; for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } lock_buffer(bh); ntfs_bitmap_set_le(bh->b_data, wbit, op); wnd->free_bits[iw] -= op; wnd->total_zeroes -= op; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); put_bh(bh); if (!RB_EMPTY_ROOT(&wnd->start_tree)) wnd_remove_free_ext(wnd, bit, op); } return err; } /* * wnd_set_used_safe - Mark the bits range from bit to bit + bits as used. * * Unlikely wnd_set_used/wnd_set_free this function is not full trusted. * It scans every bit in bitmap and marks free bit as used. * @done - how many bits were marked as used. * * NOTE: normally *done should be 0. 
*/ int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits, size_t *done) { size_t i, from = 0, len = 0; int err = 0; *done = 0; for (i = 0; i < bits; i++) { if (wnd_is_free(wnd, bit + i, 1)) { if (!len) from = bit + i; len += 1; } else if (len) { err = wnd_set_used(wnd, from, len); *done += len; len = 0; if (err) break; } } if (len) { /* last fragment. */ err = wnd_set_used(wnd, from, len); *done += len; } return err; } /* * wnd_is_free_hlp * * Return: True if all clusters [bit, bit+bits) are free (bitmap only). */ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) { struct super_block *sb = wnd->sb; size_t iw = bit >> (sb->s_blocksize_bits + 3); u32 wbits = 8 * sb->s_blocksize; u32 wbit = bit & (wbits - 1); u32 op; for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { if (unlikely(iw + 1 == wnd->nwnd)) wbits = wnd->bits_last; op = min_t(u32, wbits - wbit, bits); if (wbits != wnd->free_bits[iw]) { bool ret; struct buffer_head *bh = wnd_map(wnd, iw); if (IS_ERR(bh)) return false; ret = are_bits_clear(bh->b_data, wbit, op); put_bh(bh); if (!ret) return false; } } return true; } /* * wnd_is_free * * Return: True if all clusters [bit, bit+bits) are free. */ bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) { bool ret; struct rb_node *n; size_t end; struct e_node *e; if (RB_EMPTY_ROOT(&wnd->start_tree)) goto use_wnd; n = rb_lookup(&wnd->start_tree, bit); if (!n) goto use_wnd; e = rb_entry(n, struct e_node, start.node); end = e->start.key + e->count.key; if (bit < end && bit + bits <= end) return true; use_wnd: ret = wnd_is_free_hlp(wnd, bit, bits); return ret; } /* * wnd_is_used * * Return: True if all clusters [bit, bit+bits) are used. 
*/
bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
{
	bool ret = false;
	struct super_block *sb = wnd->sb;
	size_t iw = bit >> (sb->s_blocksize_bits + 3);
	u32 wbits = 8 * sb->s_blocksize;
	u32 wbit = bit & (wbits - 1);
	u32 op;
	size_t end;
	struct rb_node *n;
	struct e_node *e;

	if (RB_EMPTY_ROOT(&wnd->start_tree))
		goto use_wnd;

	/* Quick reject: a cached free extent that reaches into the range. */
	end = bit + bits;
	n = rb_lookup(&wnd->start_tree, end - 1);
	if (!n)
		goto use_wnd;

	e = rb_entry(n, struct e_node, start.node);
	if (e->start.key + e->count.key > bit)
		return false;

use_wnd:
	for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) {
		if (unlikely(iw + 1 == wnd->nwnd))
			wbits = wnd->bits_last;

		op = min_t(u32, wbits - wbit, bits);

		/* Windows without free bits need no read. */
		if (wnd->free_bits[iw]) {
			/* Shadows the outer 'ret', which stays false on 'goto out'. */
			bool ret;
			struct buffer_head *bh = wnd_map(wnd, iw);

			if (IS_ERR(bh))
				goto out;

			ret = are_bits_set(bh->b_data, wbit, op);
			put_bh(bh);
			if (!ret)
				goto out;
		}
	}
	ret = true;

out:
	return ret;
}

/*
 * wnd_find - Look for free space.
 *
 * @wnd:       Bitmap to search.
 * @to_alloc:  Wanted number of bits.
 * @hint:      Preferred start bit; 0 means "no preference".
 * @flags:     BITMAP_FIND_XXX flags. With BITMAP_FIND_FULL only a run of the
 *             full requested length is accepted; with
 *             BITMAP_FIND_MARK_AS_USED the found run is marked as used
 *             through wnd_set_used().
 * @allocated: Out: first bit of the found run (written only on success).
 *
 * The cached extent trees are tried first (the extent containing @hint,
 * then the biggest extent); the on-disk bitmap is scanned only when the
 * trees cannot answer the request. The reserved zone
 * [zone_bit, zone_end) is skipped.
 *
 * Return: Number of bits found, 0 if not found.
 */
size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
		size_t flags, size_t *allocated)
{
	struct super_block *sb;
	u32 wbits, wpos, wzbit, wzend;
	size_t fnd, max_alloc, b_len, b_pos;
	size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend;
	size_t to_alloc0 = to_alloc;
	const struct e_node *e;
	const struct rb_node *pr, *cr;
	u8 log2_bits;
	bool fbits_valid;
	struct buffer_head *bh;

	/* Fast checking for available free space. */
	if (flags & BITMAP_FIND_FULL) {
		size_t zeroes = wnd_zeroes(wnd);

		/* Free bits inside the reserved zone are not available. */
		zeroes -= wnd->zone_end - wnd->zone_bit;
		if (zeroes < to_alloc0)
			goto no_space;

		if (to_alloc0 > wnd->extent_max)
			goto no_space;
	} else {
		if (to_alloc > wnd->extent_max)
			to_alloc = wnd->extent_max;
	}

	/* A hint inside the zone is moved to the zone's end. */
	if (wnd->zone_bit <= hint && hint < wnd->zone_end)
		hint = wnd->zone_end;

	max_alloc = wnd->nbits;
	b_len = b_pos = 0;

	if (hint >= max_alloc)
		hint = 0;

	if (RB_EMPTY_ROOT(&wnd->start_tree)) {
		if (wnd->uptodated == 1) {
			/* Extents tree is updated -> No free space. */
			goto no_space;
		}
		goto scan_bitmap;
	}

	e = NULL;
	if (!hint)
		goto allocate_biggest;

	/* Use hint: Enumerate extents by start >= hint. */
	pr = NULL;
	cr = wnd->start_tree.rb_node;

	/* Find the extent with the greatest start that is <= hint. */
	for (;;) {
		e = rb_entry(cr, struct e_node, start.node);

		if (e->start.key == hint)
			break;

		if (e->start.key < hint) {
			pr = cr;
			cr = cr->rb_right;
			if (!cr)
				break;
			continue;
		}

		cr = cr->rb_left;
		if (!cr) {
			e = pr ? rb_entry(pr, struct e_node, start.node) : NULL;
			break;
		}
	}

	if (!e)
		goto allocate_biggest;

	if (e->start.key + e->count.key > hint) {
		/* We have found extension with 'hint' inside. */
		size_t len = e->start.key + e->count.key - hint;

		if (len >= to_alloc && hint + to_alloc <= max_alloc) {
			fnd = hint;
			goto found;
		}

		if (!(flags & BITMAP_FIND_FULL)) {
			if (len > to_alloc)
				len = to_alloc;

			if (hint + len <= max_alloc) {
				fnd = hint;
				to_alloc = len;
				goto found;
			}
		}
	}

allocate_biggest:
	/* Allocate from biggest free extent. */
	e = rb_entry(rb_first(&wnd->count_tree), struct e_node, count.node);
	if (e->count.key != wnd->extent_max)
		wnd->extent_max = e->count.key;

	if (e->count.key < max_alloc) {
		if (e->count.key >= to_alloc) {
			;
		} else if (flags & BITMAP_FIND_FULL) {
			if (e->count.key < to_alloc0) {
				/* Biggest free block is less then requested. */
				goto no_space;
			}
			to_alloc = e->count.key;
		} else if (-1 != wnd->uptodated) {
			to_alloc = e->count.key;
		} else {
			/* Check if we can use more bits. */
			size_t op, max_check;
			struct rb_root start_tree;

			/*
			 * Hide the start tree temporarily so that wnd_is_free()
			 * takes its bitmap path instead of the cached extents.
			 */
			memcpy(&start_tree, &wnd->start_tree,
			       sizeof(struct rb_root));
			memset(&wnd->start_tree, 0, sizeof(struct rb_root));

			max_check = e->start.key + to_alloc;
			if (max_check > max_alloc)
				max_check = max_alloc;
			for (op = e->start.key + e->count.key; op < max_check;
			     op++) {
				if (!wnd_is_free(wnd, op, 1))
					break;
			}
			memcpy(&wnd->start_tree, &start_tree,
			       sizeof(struct rb_root));
			to_alloc = op - e->start.key;
		}

		/* Prepare to return. */
		fnd = e->start.key;
		if (e->start.key + to_alloc > max_alloc)
			to_alloc = max_alloc - e->start.key;
		goto found;
	}

	if (wnd->uptodated == 1) {
		/* Extents tree is updated -> no free space. */
		goto no_space;
	}

	b_len = e->count.key;
	b_pos = e->start.key;

scan_bitmap:
	sb = wnd->sb;
	/* log2 of the number of bits in one window. */
	log2_bits = sb->s_blocksize_bits + 3;

	/* At most two ranges [hint, max_alloc) + [0, hint). */
Again:

	/* TODO: Optimize request for case nbits > wbits. */
	iw = hint >> log2_bits;
	wbits = sb->s_blocksize * 8;
	wpos = hint & (wbits - 1);
	prev_tail = 0;
	fbits_valid = true;

	if (max_alloc == wnd->nbits) {
		nwnd = wnd->nwnd;
	} else {
		size_t t = max_alloc + wbits - 1;

		/* 't > max_alloc' is false only if the addition wrapped. */
		nwnd = likely(t > max_alloc) ? (t >> log2_bits) : wnd->nwnd;
	}

	/* Enumerate all windows. */
	for (; iw < nwnd; iw++) {
		wbit = iw << log2_bits;

		if (!wnd->free_bits[iw]) {
			if (prev_tail > b_len) {
				b_pos = wbit - prev_tail;
				b_len = prev_tail;
			}

			/* Skip full used window. */
			prev_tail = 0;
			wpos = 0;
			continue;
		}

		if (unlikely(iw + 1 == nwnd)) {
			if (max_alloc == wnd->nbits) {
				wbits = wnd->bits_last;
			} else {
				size_t t = max_alloc & (wbits - 1);

				if (t) {
					/*
					 * Window is cut at max_alloc, so
					 * free_bits[iw] no longer describes
					 * the scanned part.
					 */
					wbits = t;
					fbits_valid = false;
				}
			}
		}

		if (wnd->zone_end > wnd->zone_bit) {
			ebit = wbit + wbits;
			zbit = max(wnd->zone_bit, wbit);
			zend = min(wnd->zone_end, ebit);

			/* Here we have a window [wbit, ebit) and zone [zbit, zend). */
			if (zend <= zbit) {
				/* Zone does not overlap window. */
			} else {
				wzbit = zbit - wbit;
				wzend = zend - wbit;

				/* Zone overlaps window. */
				if (wnd->free_bits[iw] == wzend - wzbit) {
					/* All free bits of the window are in the zone. */
					prev_tail = 0;
					wpos = 0;
					continue;
				}

				/* Scan two ranges window: [wbit, zbit) and [zend, ebit). */
				bh = wnd_map(wnd, iw);

				if (IS_ERR(bh)) {
					/* TODO: Error */
					prev_tail = 0;
					wpos = 0;
					continue;
				}

				/* Scan range [wbit, zbit). */
				if (wpos < wzbit) {
					/* Scan range [wpos, zbit). */
					fnd = wnd_scan(bh->b_data, wbit, wpos,
						       wzbit, to_alloc,
						       &prev_tail, &b_pos,
						       &b_len);
					if (fnd != MINUS_ONE_T) {
						put_bh(bh);
						goto found;
					}
				}

				/* A run must not continue across the zone. */
				prev_tail = 0;

				/* Scan range [zend, ebit). */
				if (wzend < wbits) {
					fnd = wnd_scan(bh->b_data, wbit,
						       max(wzend, wpos), wbits,
						       to_alloc, &prev_tail,
						       &b_pos, &b_len);
					if (fnd != MINUS_ONE_T) {
						put_bh(bh);
						goto found;
					}
				}

				wpos = 0;
				put_bh(bh);
				continue;
			}
		}

		/* Current window does not overlap zone. */
		if (!wpos && fbits_valid && wnd->free_bits[iw] == wbits) {
			/* Window is empty. */
			if (prev_tail + wbits >= to_alloc) {
				fnd = wbit + wpos - prev_tail;
				goto found;
			}

			/* Increase 'prev_tail' and process next window. */
			prev_tail += wbits;
			wpos = 0;
			continue;
		}

		/* Read window. */
		bh = wnd_map(wnd, iw);
		if (IS_ERR(bh)) {
			// TODO: Error.
			prev_tail = 0;
			wpos = 0;
			continue;
		}

		/* Scan range [wpos, eBits). */
		fnd = wnd_scan(bh->b_data, wbit, wpos, wbits, to_alloc,
			       &prev_tail, &b_pos, &b_len);
		put_bh(bh);
		if (fnd != MINUS_ONE_T)
			goto found;
	}

	if (b_len < prev_tail) {
		/* The last fragment. */
		b_len = prev_tail;
		b_pos = max_alloc - prev_tail;
	}

	if (hint) {
		/*
		 * We have scanned range [hint max_alloc).
		 * Prepare to scan range [0 hint + to_alloc).
		 */
		size_t nextmax = hint + to_alloc;

		if (likely(nextmax >= hint) && nextmax < max_alloc)
			max_alloc = nextmax;
		hint = 0;
		goto Again;
	}

	if (!b_len)
		goto no_space;

	/* The whole bitmap was scanned, so 'b_len' is the real maximum. */
	wnd->extent_max = b_len;

	if (flags & BITMAP_FIND_FULL)
		goto no_space;

	fnd = b_pos;
	to_alloc = b_len;

found:
	if (flags & BITMAP_FIND_MARK_AS_USED) {
		/* TODO: Optimize remove extent (pass 'e'?). */
		if (wnd_set_used(wnd, fnd, to_alloc))
			goto no_space;
	} else if (wnd->extent_max != MINUS_ONE_T &&
		   to_alloc > wnd->extent_max) {
		wnd->extent_max = to_alloc;
	}

	*allocated = fnd;
	return to_alloc;

no_space:
	return 0;
}

/*
 * wnd_extend - Extend bitmap ($MFT bitmap).
 *
 * @wnd:      Bitmap to grow.
 * @new_bits: New total number of bits; must exceed the current size.
 *
 * Return: 0 on success, -EINVAL if @new_bits does not grow the bitmap,
 *         -ENOMEM, -EIO or the error returned by ntfs_vbo_to_lbo().
 */
int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
{
	int err;
	struct super_block *sb = wnd->sb;
	struct ntfs_sb_info *sbi = sb->s_fs_info;
	u32 blocksize = sb->s_blocksize;
	u32 wbits = blocksize * 8;
	u32 b0, new_last;
	size_t bits, iw, new_wnd;
	size_t old_bits = wnd->nbits;
	u16 *new_free;

	if (new_bits <= old_bits)
		return -EINVAL;

	/* Align to 8 byte boundary.
*/ new_wnd = bytes_to_block(sb, ntfs3_bitmap_size(new_bits)); new_last = new_bits & (wbits - 1); if (!new_last) new_last = wbits; if (new_wnd != wnd->nwnd) { new_free = kmalloc_array(new_wnd, sizeof(u16), GFP_NOFS); if (!new_free) return -ENOMEM; memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); memset(new_free + wnd->nwnd, 0, (new_wnd - wnd->nwnd) * sizeof(short)); kvfree(wnd->free_bits); wnd->free_bits = new_free; } /* Zero bits [old_bits,new_bits). */ bits = new_bits - old_bits; b0 = old_bits & (wbits - 1); for (iw = old_bits >> (sb->s_blocksize_bits + 3); bits; iw += 1) { u32 op; size_t frb; u64 vbo, lbo, bytes; struct buffer_head *bh; if (iw + 1 == new_wnd) wbits = new_last; op = b0 + bits > wbits ? wbits - b0 : bits; vbo = (u64)iw * blocksize; err = ntfs_vbo_to_lbo(sbi, &wnd->run, vbo, &lbo, &bytes); if (err) return err; bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); if (!bh) return -EIO; lock_buffer(bh); ntfs_bitmap_clear_le(bh->b_data, b0, blocksize * 8 - b0); frb = wbits - ntfs_bitmap_weight_le(bh->b_data, wbits); wnd->total_zeroes += frb - wnd->free_bits[iw]; wnd->free_bits[iw] = frb; set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); /* err = sync_dirty_buffer(bh); */ b0 = 0; bits -= op; } wnd->nbits = new_bits; wnd->nwnd = new_wnd; wnd->bits_last = new_last; wnd_add_free_ext(wnd, old_bits, new_bits - old_bits, false); return 0; } void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len) { size_t zlen = wnd->zone_end - wnd->zone_bit; if (zlen) wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false); if (!RB_EMPTY_ROOT(&wnd->start_tree) && len) wnd_remove_free_ext(wnd, lcn, len); wnd->zone_bit = lcn; wnd->zone_end = lcn + len; } int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) { int err = 0; struct super_block *sb = sbi->sb; struct wnd_bitmap *wnd = &sbi->used.bitmap; u32 wbits = 8 * sb->s_blocksize; CLST len = 0, lcn = 0, done = 0; CLST minlen = bytes_to_cluster(sbi, range->minlen); CLST lcn_from = 
bytes_to_cluster(sbi, range->start); size_t iw = lcn_from >> (sb->s_blocksize_bits + 3); u32 wbit = lcn_from & (wbits - 1); CLST lcn_to; if (!minlen) minlen = 1; if (range->len == (u64)-1) lcn_to = wnd->nbits; else lcn_to = bytes_to_cluster(sbi, range->start + range->len); down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); for (; iw < wnd->nwnd; iw++, wbit = 0) { CLST lcn_wnd = iw * wbits; struct buffer_head *bh; if (lcn_wnd > lcn_to) break; if (!wnd->free_bits[iw]) continue; if (iw + 1 == wnd->nwnd) wbits = wnd->bits_last; if (lcn_wnd + wbits > lcn_to) wbits = lcn_to - lcn_wnd; bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { err = PTR_ERR(bh); break; } for (; wbit < wbits; wbit++) { if (!test_bit_le(wbit, bh->b_data)) { if (!len) lcn = lcn_wnd + wbit; len += 1; continue; } if (len >= minlen) { err = ntfs_discard(sbi, lcn, len); if (err) goto out; done += len; } len = 0; } put_bh(bh); } /* Process the last fragment. */ if (len >= minlen) { err = ntfs_discard(sbi, lcn, len); if (err) goto out; done += len; } out: range->len = (u64)done << sbi->cluster_bits; up_read(&wnd->rw_lock); return err; } #if BITS_PER_LONG == 64 typedef __le64 bitmap_ulong; #define cpu_to_ul(x) cpu_to_le64(x) #define ul_to_cpu(x) le64_to_cpu(x) #else typedef __le32 bitmap_ulong; #define cpu_to_ul(x) cpu_to_le32(x) #define ul_to_cpu(x) le32_to_cpu(x) #endif void ntfs_bitmap_set_le(void *map, unsigned int start, int len) { bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); bitmap_ulong mask_to_set = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); while (len - bits_to_set >= 0) { *p |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = cpu_to_ul(~0UL); p++; } if (len) { mask_to_set &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); *p |= mask_to_set; } } void ntfs_bitmap_clear_le(void *map, unsigned int start, int len) { bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start); const 
unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); bitmap_ulong mask_to_clear = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start)); while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = cpu_to_ul(~0UL); p++; } if (len) { mask_to_clear &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size)); *p &= ~mask_to_clear; } } unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits) { const ulong *bmp = bitmap; unsigned int k, lim = bits / BITS_PER_LONG; unsigned int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bmp[k]); if (bits % BITS_PER_LONG) { w += hweight_long(ul_to_cpu(((bitmap_ulong *)bitmap)[k]) & BITMAP_LAST_WORD_MASK(bits)); } return w; }
886 5 1408 1224 1958 840 657 290 359 639 729 451 389 1004 2255 2257 1758 822 508 1234 1303 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. 
*
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Callbacks supplied by the user of an augmented rbtree; the
 * RB_DECLARE_CALLBACKS() template below generates all three.
 */
struct rb_augment_callbacks {
	void (*propagate)(struct rb_node *node, struct rb_node *stop);
	void (*copy)(struct rb_node *old, struct rb_node *new);
	void (*rotate)(struct rb_node *old, struct rb_node *new);
};

extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Fixup the rbtree and update the augmented information when rebalancing.
 *
 * On insertion, the user must update the augmented information on the path
 * leading to the inserted node, then call rb_link_node() as usual and
 * rb_insert_augmented() instead of the usual rb_insert_color() call.
 * If rb_insert_augmented() rebalances the rbtree, it will callback into
 * a user provided function to update the augmented information on the
 * affected subtrees.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
		    const struct rb_augment_callbacks *augment)
{
	__rb_insert_augmented(node, root, augment->rotate);
}

/*
 * Same as rb_insert_augmented() for a cached root: when @newleft is true
 * the cached leftmost pointer is set to @node before rebalancing.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
			   struct rb_root_cached *root, bool newleft,
			   const struct rb_augment_callbacks *augment)
{
	if (newleft)
		root->rb_leftmost = node;
	rb_insert_augmented(node, &root->rb_root, augment);
}

/*
 * Find the insertion point for @node with @less, link it, propagate the
 * augmented data from its parent and rebalance.
 *
 * Return: @node if it became the leftmost node of @tree, NULL otherwise.
 */
static __always_inline struct rb_node *
rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
			bool (*less)(struct rb_node *, const struct rb_node *),
			const struct rb_augment_callbacks *augment)
{
	struct rb_node **link = &tree->rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		parent = *link;
		if (less(node, parent)) {
			link = &parent->rb_left;
		} else {
			/* One step to the right: no longer the leftmost. */
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(node, parent, link);
	augment->propagate(parent, NULL); /* suboptimal */
	rb_insert_augmented_cached(node, tree, leftmost, augment);

	return leftmost ? node : NULL;
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * RBCOMPUTE is called as RBCOMPUTE(node, exit); the generated propagate
 * callback stops walking towards the root as soon as it returns true.
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,				\
			     RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)	\
static inline void							\
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
{									\
	while (rb != stop) {						\
		RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);	\
		if (RBCOMPUTE(node, true))				\
			break;						\
		rb = rb_parent(&node->RBFIELD);				\
	}								\
}									\
static inline void							\
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
{									\
	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
	new->RBAUGMENTED = old->RBAUGMENTED;				\
}									\
static void								\
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
{									\
	RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);		\
	RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);		\
	new->RBAUGMENTED = old->RBAUGMENTED;				\
	RBCOMPUTE(old, false);						\
}									\
RBSTATIC const struct rb_augment_callbacks RBNAME = {			\
	.propagate = RBNAME ## _propagate,				\
	.copy = RBNAME ## _copy,					\
	.rotate = RBNAME ## _rotate					\
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
* * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else 
WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? 
parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? 
parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */
27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. 
*/ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. 
*/ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */
33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include <asm/types.h> #include <linux/bits.h> #include <linux/typecheck.h> #include <uapi/linux/kernel.h> #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) 
#define BYTES_TO_BITS(nb) ((nb) * BITS_PER_BYTE) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); /* * Defined here because those may be needed by architecture-specific static * inlines. */ #include <asm-generic/bitops/generic-non-atomic.h> /* * Many architecture-specific non-atomic bitops contain inline asm code and due * to that the compiler can't optimize them to compile-time expressions or * constants. In contrary, generic_*() helpers are defined in pure C and * compilers optimize them just well. * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when * the arguments can be resolved at compile time. That expression itself is a * constant and doesn't bring any functional changes to the rest of cases. * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when * passing a bitmap from .bss or .data (-> `!!addr` is always true). */ #define bitop(op, nr, addr) \ ((__builtin_constant_p(nr) && \ __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) && \ (uintptr_t)(addr) != (uintptr_t)NULL && \ __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) /* * The following macros are non-atomic versions of their non-underscored * counterparts. 
*/ #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); __check_bitop_pr(test_bit_acquire); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? 
hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit * * This is safe to use for 16- and 8-bit types as well. 
*/ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned int fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * parity8 - get the parity of an u8 value * @value: the value to be examined * * Determine the parity of the u8 argument. * * Returns: * 0 for even parity, 1 for odd parity * * Note: This function informs you about the current parity. Example to bail * out when parity is odd: * * if (parity8(val) == 1) * return -EBADMSG; * * If you need to calculate a parity bit, you need to draw the conclusion from * this result yourself. Example to enforce odd parity, parity bit is bit 7: * * if (parity8(val) == 0) * val ^= BIT(7); */ static inline int parity8(u8 val) { /* * One explanation of this algorithm: * https://funloop.org/codex/problem/parity/README.html */ val ^= val >> 4; return (0x6996 >> (val & 0xf)) & 1; } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. 
*/ static inline unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned int fns(unsigned long word, unsigned int n) { while (word && n--) word &= word - 1; return word ? __ffs(word) : BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ #define assign_bit(nr, addr, value) \ ((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr))) #define __assign_bit(nr, addr, value) \ ((value) ? __set_bit((nr), (addr)) : __clear_bit((nr), (addr))) /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... 
* } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif
12 9 1 57 55 43 24 44 1 43 43 5 43 43 43 44 1 42 42 42 6 3 2 1 1 6 32 26 30 30 26 7 26 24 24 8 25 1 38 25 23 25 25 1 24 24 4 3 3 12 12 1 12 1 10 7 1 1 1 12 1 12 12 4 5 4 5 3 6 6 3 4 4 2 1 1 3 1 1 4 6 6 5 6 18 1 16 35 33 35 35 3 35 33 2 3 35 35 35 35 35 4 35 35 34 35 35 35 35 35 35 2 35 3 1 1 35 19 35 5 1 1 4 4 35 5 35 35 21 21 21 21 21 6 4 3 1 4 3 3 35 10 33 21 13 21 21 21 21 21 16 1 1 14 14 14 14 3 3 2 3 14 13 13 14 3 3 4 4 2 2 12 12 10 12 11 1 1 2 18 2 1 19 12 7 2 7 19 19 18 1 26 3 4 2 1 1 21 2 12 12 12 7 22 10 8 10 10 8 6 1 5 5 6 6 6 6 6 17 1 2 3 1 11 11 11 11 10 10 1 4 1 3 2 1 2 7 17 32 4 5 2 7 5 9 7 7 25 62 63 2 62 1 57 1 29 18 29 11 9 15 29 29 43 12 5 7 2 5 49 49 43 33 3 7 7 6 7 36 36 28 26 15 26 43 9 3 7 41 32 11 2 9 1 1 1 1 1 36 32 47 19 34 30 34 1 1 28 73 64 2 63 64 61 3 61 3 63 64 42 22 43 22 45 71 1 73 18 18 17 18 18 1 18 18 1 18 18 18 17 18 17 17 1 18 1 18 18 17 3 1 2 18 4 18 18 17 18 17 17 18 3 17 18 18 17 18 18 18 17 18 18 1 8 18 18 1 18 18 18 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 
284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 
784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 
1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 
1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 
2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 
2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 
2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> #include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> #include 
<linux/minmax.h>
#include <linux/overflow.h>
#include <linux/buildid.h>

#include <asm/elf.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"

/*
 * Print @val after shifting it left by (PAGE_SHIFT - 10), i.e. a page count
 * expressed in kB, passing 8 as the width argument.  Relies on a seq_file
 * pointer named 'm' being in scope at the point of use.
 */
#define SEQ_PUT_DEC(str, val) \
		seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)

/*
 * task_mem - write the Vm* and Rss* summary lines for @mm into @m.
 *
 * Values taken from page counters go through SEQ_PUT_DEC(); text, lib and
 * the page-table size are byte quantities and are shifted right by 10
 * instead.  Each label string also carries the " kB\n" suffix of the
 * previous line.
 */
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long text, lib, swap, anon, file, shmem;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	anon = get_mm_counter(mm, MM_ANONPAGES);
	file = get_mm_counter(mm, MM_FILEPAGES);
	shmem = get_mm_counter(mm, MM_SHMEMPAGES);

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher.  Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = anon + file + shmem;
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	/* split executable areas between text and lib */
	text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
	/* text can never exceed the total size of executable mappings */
	text = min(text, mm->exec_vm << PAGE_SHIFT);
	lib = (mm->exec_vm << PAGE_SHIFT) - text;

	swap = get_mm_counter(mm, MM_SWAPENTS);
	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
	SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm));
	SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss);
	SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss);
	SEQ_PUT_DEC(" kB\nRssAnon:\t", anon);
	SEQ_PUT_DEC(" kB\nRssFile:\t", file);
	SEQ_PUT_DEC(" kB\nRssShmem:\t", shmem);
	SEQ_PUT_DEC(" kB\nVmData:\t", mm->data_vm);
	SEQ_PUT_DEC(" kB\nVmStk:\t", mm->stack_vm);
	/* text, lib and page-table size are in bytes: convert with >> 10 */
	seq_put_decimal_ull_width(m,
		    " kB\nVmExe:\t", text >> 10, 8);
	seq_put_decimal_ull_width(m,
		    " kB\nVmLib:\t", lib >> 10, 8);
	seq_put_decimal_ull_width(m,
		    " kB\nVmPTE:\t", mm_pgtables_bytes(mm) >> 10, 8);
SEQ_PUT_DEC(" kB\nVmSwap:\t", swap);
	seq_puts(m, " kB\n");
	hugetlb_report_usage(m, mm);
}
#undef SEQ_PUT_DEC

/* Return mm->total_vm multiplied by PAGE_SIZE. */
unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

/*
 * task_statm - report per-mm counters through the output pointers.
 * @shared:   MM_FILEPAGES + MM_SHMEMPAGES counters
 * @text:     page-aligned span between start_code and end_code, in pages
 * @data:     data_vm + stack_vm
 * @resident: *shared + MM_ANONPAGES counter
 *
 * Returns mm->total_vm.
 */
unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES) +
			get_mm_counter(mm, MM_SHMEMPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->data_vm + mm->stack_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

#ifdef CONFIG_NUMA
/*
 * Save get_task_policy() for show_numa_map().
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = get_task_policy(task);
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
/* Counterpart of hold_task_mempolicy(): mpol_put() the saved policy. */
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
/* Without CONFIG_NUMA both helpers are empty. */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

/*
 * Advance priv->iter by one VMA.  When a VMA is found, *ppos becomes its
 * vm_start.  When the iterator is exhausted, *ppos is set to the sentinel
 * -2UL and the result of get_gate_vma() is returned instead.
 */
static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
						loff_t *ppos)
{
	struct vm_area_struct *vma = vma_next(&priv->iter);

	if (vma) {
		*ppos = vma->vm_start;
	} else {
		*ppos = -2UL;
		vma = get_gate_vma(priv->mm);
	}

	return vma;
}

/*
 * seq_file ->start.  *ppos doubles as a cursor: an address to resume from,
 * -2UL meaning "only the gate VMA is left", or -1UL meaning "finished".
 *
 * On success the task reference, an mm user reference and the mmap read
 * lock are all held; m_stop() releases them.  On the failure paths below
 * priv->task ends up NULL.
 */
static void *m_start(struct seq_file *m, loff_t *ppos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = *ppos;
	struct mm_struct *mm;

	/* See m_next(). Zero at the start or after lseek. */
	if (last_addr == -1UL)
		return NULL;

	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = priv->mm;
	if (!mm || !mmget_not_zero(mm)) {
		put_task_struct(priv->task);
		priv->task = NULL;
		return NULL;
	}

	if (mmap_read_lock_killable(mm)) {
		mmput(mm);
		put_task_struct(priv->task);
		priv->task = NULL;
		return ERR_PTR(-EINTR);
	}

	vma_iter_init(&priv->iter, mm, last_addr);
	hold_task_mempolicy(priv);
	if (last_addr == -2UL)
		return get_gate_vma(mm);

	return proc_get_vma(priv, ppos);
}

/*
 * seq_file ->next.  Once the cursor has reached -2UL it is moved on to
 * -1UL and iteration ends; otherwise fetch the next VMA.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
	if (*ppos == -2UL) {
		*ppos = -1UL;
		return NULL;
	}
	return proc_get_vma(m->private, ppos);
}

/*
 * seq_file ->stop.  Releases what m_start() acquired, in reverse order.
 * A NULL priv->task means there is nothing to release.
 */
static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct mm_struct *mm = priv->mm;

	if (!priv->task)
		return;

	release_task_mempolicy(priv);
	mmap_read_unlock(mm);
	mmput(mm);
	put_task_struct(priv->task);
	priv->task = NULL;
}

/*
 * Common open helper: allocate @psize bytes of seq private data, remember
 * @inode and store the result of proc_mem_open() in priv->mm.
 *
 * Returns 0, -ENOMEM, or the error encoded in the proc_mem_open() result
 * (after releasing the seq private data again).
 */
static int proc_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops, int psize)
{
	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);

	if (!priv)
		return -ENOMEM;

	priv->inode = inode;
	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(priv->mm)) {
		int err = PTR_ERR(priv->mm);

		seq_release_private(inode, file);
		return err;
	}

	return 0;
}

/* ->release: mmdrop() priv->mm if it is set, then free the seq private data. */
static int proc_map_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->mm)
		mmdrop(priv->mm);

	return seq_release_private(inode, file);
}

/* proc_maps_open() with the private size fixed to struct proc_maps_private. */
static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	return proc_maps_open(inode, file, ops,
				sizeof(struct proc_maps_private));
}

/*
 * get_vma_name - work out how @vma should be labelled.
 *
 * All three outputs start out NULL.  On return either *path is set, or
 * *name is set (with *name_fmt optionally holding a printf format that
 * takes *name as its single argument), or everything is still NULL.
 */
static void get_vma_name(struct vm_area_struct *vma,
			 const struct path **path,
			 const char **name,
			 const char **name_fmt)
{
	struct anon_vma_name *anon_name = vma->vm_mm ?
anon_vma_name(vma) : NULL;

	*name = NULL;
	*path = NULL;
	*name_fmt = NULL;

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (vma->vm_file) {
		/*
		 * If user named this anon shared memory via
		 * prctl(PR_SET_VMA ..., use the provided name.
		 */
		if (anon_name) {
			*name_fmt = "[anon_shmem:%s]";
			*name = anon_name->name;
		} else {
			*path = file_user_path(vma->vm_file);
		}
		return;
	}

	/* No backing file: try the vm_ops hook first, then the arch hook. */
	if (vma->vm_ops && vma->vm_ops->name) {
		*name = vma->vm_ops->name(vma);
		if (*name)
			return;
	}

	*name = arch_vma_name(vma);
	if (*name)
		return;

	/* A VMA without an mm is reported as the vdso. */
	if (!vma->vm_mm) {
		*name = "[vdso]";
		return;
	}

	if (vma_is_initial_heap(vma)) {
		*name = "[heap]";
		return;
	}

	if (vma_is_initial_stack(vma)) {
		*name = "[stack]";
		return;
	}

	/* Last resort: a user-supplied anonymous VMA name, if any. */
	if (anon_name) {
		*name_fmt = "[anon:%s]";
		*name = anon_name->name;
		return;
	}
}

/*
 * Emit the fixed leading part of a maps line:
 * "start-end", the four permission characters, pgoff, major:minor and the
 * inode number, each followed by a separator.  seq_setwidth() is called
 * first so that a later seq_pad() can align what follows.
 */
static void show_vma_header_prefix(struct seq_file *m,
				   unsigned long start, unsigned long end,
				   vm_flags_t flags, unsigned long long pgoff,
				   dev_t dev, unsigned long ino)
{
	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_put_hex_ll(m, NULL, start, 8);
	seq_put_hex_ll(m, "-", end, 8);
	seq_putc(m, ' ');
	seq_putc(m, flags & VM_READ ? 'r' : '-');
	seq_putc(m, flags & VM_WRITE ? 'w' : '-');
	seq_putc(m, flags & VM_EXEC ? 'x' : '-');
	seq_putc(m, flags & VM_MAYSHARE ?
's' : 'p');
	seq_put_hex_ll(m, " ", pgoff, 8);
	seq_put_hex_ll(m, " ", MAJOR(dev), 2);
	seq_put_hex_ll(m, ":", MINOR(dev), 2);
	seq_put_decimal_ull(m, " ", ino);
	seq_putc(m, ' ');
}

/*
 * Print one complete maps line for @vma: the header prefix followed by the
 * path or name chosen by get_vma_name() (if any) and a newline.  Device,
 * inode and offset stay zero for VMAs without a backing file.
 */
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
	const struct path *path;
	const char *name_fmt, *name;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;

	if (vma->vm_file) {
		const struct inode *inode = file_user_inode(vma->vm_file);

		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		/* widen before shifting so the byte offset is not truncated */
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	start = vma->vm_start;
	end = vma->vm_end;
	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);

	get_vma_name(vma, &path, &name, &name_fmt);
	if (path) {
		seq_pad(m, ' ');
		seq_path(m, path, "\n");
	} else if (name_fmt) {
		seq_pad(m, ' ');
		seq_printf(m, name_fmt, name);
	} else if (name) {
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

/* seq_file ->show for maps: @v is the VMA returned by m_start()/m_next(). */
static int show_map(struct seq_file *m, void *v)
{
	show_map_vma(m, v);
	return 0;
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

/* Query flags that constrain the permissions of the VMA being looked up. */
#define PROCMAP_QUERY_VMA_FLAGS (				\
		PROCMAP_QUERY_VMA_READABLE |			\
		PROCMAP_QUERY_VMA_WRITABLE |			\
		PROCMAP_QUERY_VMA_EXECUTABLE |			\
		PROCMAP_QUERY_VMA_SHARED			\
)

/* Every query flag accepted by do_procmap_query(); anything else is -EINVAL. */
#define PROCMAP_QUERY_VALID_FLAGS_MASK (			\
		PROCMAP_QUERY_COVERING_OR_NEXT_VMA |		\
		PROCMAP_QUERY_FILE_BACKED_VMA |			\
		PROCMAP_QUERY_VMA_FLAGS				\
)

/* Take the mmap read lock (killable); returns its result. */
static int query_vma_setup(struct mm_struct *mm)
{
	return mmap_read_lock_killable(mm);
}

/* Drop the mmap read lock taken by query_vma_setup().  @vma is unused here. */
static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
{
	mmap_read_unlock(mm);
}

/* Thin wrapper around find_vma(). */
static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
{
	return find_vma(mm, addr);
}

/*
 * Find the VMA for @addr that satisfies @flags.
 *
 * Returns the VMA, or ERR_PTR(-ENOENT) when no VMA qualifies.
 */
static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
						 unsigned
long addr, u32 flags)
{
	struct vm_area_struct *vma;

next_vma:
	vma = query_vma_find_by_addr(mm, addr);
	if (!vma)
		goto no_vma;

	/* user requested only file-backed VMA, keep iterating */
	if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
		goto skip_vma;

	/* VMA permissions should satisfy query flags */
	if (flags & PROCMAP_QUERY_VMA_FLAGS) {
		u32 perm = 0;

		if (flags & PROCMAP_QUERY_VMA_READABLE)
			perm |= VM_READ;
		if (flags & PROCMAP_QUERY_VMA_WRITABLE)
			perm |= VM_WRITE;
		if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
			perm |= VM_EXEC;
		if (flags & PROCMAP_QUERY_VMA_SHARED)
			perm |= VM_MAYSHARE;

		/* every requested permission bit must be present */
		if ((vma->vm_flags & perm) != perm)
			goto skip_vma;
	}

	/* found covering VMA or user is OK with the matching next VMA */
	if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
		return vma;

skip_vma:
	/*
	 * If the user needs closest matching VMA, keep iterating.
	 */
	addr = vma->vm_end;
	if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
		goto next_vma;

no_vma:
	return ERR_PTR(-ENOENT);
}

/*
 * do_procmap_query - handle the PROCMAP_QUERY ioctl.
 * @priv: per-open state; priv->mm is the address space to query
 * @uarg: user pointer to a struct procmap_query whose first 8 bytes are
 *        read as its size
 *
 * Copies the request in, looks up a matching VMA under the lock taken by
 * query_vma_setup(), fills in the reply (and optional name / build ID),
 * then drops the lock and the mm reference before copying anything back
 * to user space.
 *
 * Returns 0 on success or a negative errno:
 * -EFAULT (user copy failed), -E2BIG (size above PAGE_SIZE),
 * -EINVAL (size too small, unknown flags, address/size mismatch),
 * -ESRCH (no usable mm), -ENOENT (no matching VMA),
 * -ENAMETOOLONG (user buffer too small), -ENOMEM, or whatever
 * query_vma_setup(), copy_struct_from_user() or d_path() report.
 */
static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
{
	struct procmap_query karg;
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	const char *name = NULL;
	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
	__u64 usize;
	int err;

	if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
		return -EFAULT;
	/* argument struct can never be that large, reject abuse */
	if (usize > PAGE_SIZE)
		return -E2BIG;
	/* argument struct should have at least query_flags and query_addr fields */
	if (usize < offsetofend(struct procmap_query, query_addr))
		return -EINVAL;
	err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
	if (err)
		return err;

	/* reject unknown flags */
	if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
		return -EINVAL;
	/* either both buffer address and size are set, or both should be zero */
	if (!!karg.vma_name_size != !!karg.vma_name_addr)
		return -EINVAL;
	if (!!karg.build_id_size != !!karg.build_id_addr)
		return -EINVAL;

	mm = priv->mm;
	if (!mm || !mmget_not_zero(mm))
		return -ESRCH;

	err = query_vma_setup(mm);
	if (err) {
		mmput(mm);
		return err;
	}

	vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		vma = NULL;
		goto out;
	}

	karg.vma_start = vma->vm_start;
	karg.vma_end = vma->vm_end;

	/* translate VM_* permission bits back into query flag bits */
	karg.vma_flags = 0;
	if (vma->vm_flags & VM_READ)
		karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
	if (vma->vm_flags & VM_WRITE)
		karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
	if (vma->vm_flags & VM_EXEC)
		karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
	if (vma->vm_flags & VM_MAYSHARE)
		karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;

	karg.vma_page_size = vma_kernel_pagesize(vma);

	if (vma->vm_file) {
		const struct inode *inode = file_user_inode(vma->vm_file);

		karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
		karg.dev_major = MAJOR(inode->i_sb->s_dev);
		karg.dev_minor = MINOR(inode->i_sb->s_dev);
		karg.inode = inode->i_ino;
	} else {
		karg.vma_offset = 0;
		karg.dev_major = 0;
		karg.dev_minor = 0;
		karg.inode = 0;
	}

	if (karg.build_id_size) {
		__u32 build_id_sz;

		err = build_id_parse(vma, build_id_buf, &build_id_sz);
		if (err) {
			/* a parse failure is not fatal: report "no build ID" */
			karg.build_id_size = 0;
		} else {
			if (karg.build_id_size < build_id_sz) {
				err = -ENAMETOOLONG;
				goto out;
			}
			karg.build_id_size = build_id_sz;
		}
	}

	if (karg.vma_name_size) {
		size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
		const struct path *path;
		const char *name_fmt;
		size_t name_sz = 0;

		get_vma_name(vma, &path, &name, &name_fmt);

		if (path || name_fmt || name) {
			name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
			if (!name_buf) {
				err = -ENOMEM;
				goto out;
			}
		}
		if (path) {
			name = d_path(path, name_buf, name_buf_sz);
			if (IS_ERR(name)) {
				err = PTR_ERR(name);
				goto out;
			}
			/*
			 * Size is the distance from the returned pointer to
			 * the end of the buffer.
			 * NOTE(review): presumably d_path() builds the string
			 * at the tail of name_buf - confirm.
			 */
			name_sz = name_buf + name_buf_sz - name;
		} else if (name || name_fmt) {
			/* +1 so the size includes the terminating NUL */
			name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
			name = name_buf;
		}
		if (name_sz > name_buf_sz) {
			err = -ENAMETOOLONG;
			goto out;
		}
		karg.vma_name_size = name_sz;
	}

	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
	query_vma_teardown(mm, vma);
	mmput(mm);

	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
					       name, karg.vma_name_size)) {
		kfree(name_buf);
		return -EFAULT;
	}
	kfree(name_buf);

	if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
					       build_id_buf, karg.build_id_size))
		return -EFAULT;

	/* never write back more than the caller's struct can hold */
	if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
		return -EFAULT;

	return 0;

out:
	query_vma_teardown(mm, vma);
	mmput(mm);
	kfree(name_buf);
	return err;
}

/* ioctl entry point for the maps file; only PROCMAP_QUERY is handled. */
static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	switch (cmd) {
	case PROCMAP_QUERY:
		return do_procmap_query(priv, (void __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_map_release,
	.unlocked_ioctl = procfs_procmap_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

/*
 * Proportional Set Size(PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64bit
 * fixed-point pss counter to minimize division errors. So (pss >>
 * PSS_SHIFT) would be the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12

#ifdef CONFIG_PROC_PAGE_MONITOR
/*
 * Accumulator for one smaps / smaps_rollup report.  The unsigned long
 * members are byte counts; the u64 members are byte counts scaled up by
 * PSS_SHIFT (see the PSS comment above).
 */
struct mem_size_stats {
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long lazyfree;
	unsigned long anonymous_thp;
	unsigned long shmem_thp;
	unsigned long file_thp;
	unsigned long swap;
	unsigned long shared_hugetlb;
	unsigned long private_hugetlb;
	unsigned long ksm;
	u64 pss;
	u64 pss_anon;
	u64 pss_file;
	u64 pss_shmem;
	u64 pss_dirty;
	u64 pss_locked;
	u64 swap_pss;
};

/*
 * Add @pss (already shifted by PSS_SHIFT) to the pss totals and @size to
 * exactly one of the four shared/private x clean/dirty counters.  The pss
 * share is filed under anon, shmem or file according to the folio type.
 */
static void smaps_page_accumulate(struct mem_size_stats *mss,
		struct folio *folio, unsigned long size, unsigned long pss,
		bool dirty, bool locked, bool private)
{
	mss->pss += pss;

	if (folio_test_anon(folio))
		mss->pss_anon += pss;
	else if (folio_test_swapbacked(folio))
		mss->pss_shmem += pss;
	else
		mss->pss_file += pss;

	if (locked)
		mss->pss_locked += pss;

	if (dirty || folio_test_dirty(folio)) {
		mss->pss_dirty += pss;
		if (private)
			mss->private_dirty += size;
		else
			mss->shared_dirty += size;
	} else {
		if (private)
			mss->private_clean += size;
		else
			mss->shared_clean += size;
	}
}

/*
 * Account one mapped page, or a whole compound page when @compound is
 * set, into @mss.  @young, @dirty and @present describe the page table
 * entry that maps it; @locked says whether the VMA is VM_LOCKED.
 */
static void smaps_account(struct mem_size_stats *mss, struct page *page,
		bool compound, bool young, bool dirty, bool locked,
		bool present)
{
	struct folio *folio = page_folio(page);
	int i, nr = compound ? compound_nr(page) : 1;
	unsigned long size = nr * PAGE_SIZE;

	/*
	 * First accumulate quantities that depend only on |size| and the type
	 * of the compound page.
	 */
	if (folio_test_anon(folio)) {
		mss->anonymous += size;
		if (!folio_test_swapbacked(folio) && !dirty &&
		    !folio_test_dirty(folio))
			mss->lazyfree += size;
	}

	if (folio_test_ksm(folio))
		mss->ksm += size;

	mss->resident += size;
	/* Accumulate the size in pages that have been accessed. */
	if (young || folio_test_young(folio) || folio_test_referenced(folio))
		mss->referenced += size;

	/*
	 * Then accumulate quantities that may depend on sharing, or that may
	 * differ page-by-page.
	 *
	 * refcount == 1 for present entries guarantees that the folio is mapped
	 * exactly once. For large folios this implies that exactly one
	 * PTE/PMD/... maps (a part of) this folio.
	 *
	 * Treat all non-present entries (where relying on the mapcount and
	 * refcount doesn't make sense) as "maybe shared, but not sure how
	 * often". We treat device private entries as being fake-present.
	 *
	 * Note that it would not be safe to read the mapcount especially for
	 * pages referenced by migration entries, even with the PTL held.
	 */
	if (folio_ref_count(folio) == 1 || !present) {
		smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
				      dirty, locked, present);
		return;
	}

	/*
	 * We obtain a snapshot of the mapcount. Without holding the folio lock
	 * this snapshot can be slightly wrong as we cannot always read the
	 * mapcount atomically.
	 */
	for (i = 0; i < nr; i++, page++) {
		int mapcount = folio_precise_page_mapcount(folio, page);
		unsigned long pss = PAGE_SIZE << PSS_SHIFT;

		/* split the page's share between all of its mappers */
		if (mapcount >= 2)
			pss /= mapcount;
		smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
				dirty, locked, mapcount < 2);
	}
}

#ifdef CONFIG_SHMEM
/*
 * pte_hole callback: add the shmem swap usage of the file range backing
 * [@addr, @end) to mss->swap.  Dereferences vma->vm_file unconditionally.
 */
static int smaps_pte_hole(unsigned long addr, unsigned long end,
			  __always_unused int depth, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;

	mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping,
					      linear_page_index(vma, addr),
					      linear_page_index(vma, end));

	return 0;
}
#else
#define smaps_pte_hole		NULL
#endif /* CONFIG_SHMEM */

/*
 * Run the hole accounting for the single page at @addr, but only when the
 * walk ops in use actually install a pte_hole callback.
 */
static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk)
{
#ifdef CONFIG_SHMEM
	if (walk->ops->pte_hole) {
		/* depth is not used */
		smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk);
	}
#endif
}

/*
 * Account a single PTE: present pages, swap entries, PFN swap entries,
 * and holes are each handled separately.
 */
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	bool locked = !!(vma->vm_flags & VM_LOCKED);
	struct page *page = NULL;
	bool present = false, young = false, dirty =
false;
	pte_t ptent = ptep_get(pte);

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
		young = pte_young(ptent);
		dirty = pte_dirty(ptent);
		present = true;
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (!non_swap_entry(swpent)) {
			int mapcount;

			/* a real swap entry: count it and its proportional share */
			mss->swap += PAGE_SIZE;
			mapcount = swp_swapcount(swpent);
			if (mapcount >= 2) {
				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;

				do_div(pss_delta, mapcount);
				mss->swap_pss += pss_delta;
			} else {
				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
			}
		} else if (is_pfn_swap_entry(swpent)) {
			/* device private entries are accounted as present */
			if (is_device_private_entry(swpent))
				present = true;
			page = pfn_swap_entry_to_page(swpent);
		}
	} else {
		/* neither present nor a swap pte: treat as a hole */
		smaps_pte_hole_lookup(addr, walk);
		return;
	}

	if (!page)
		return;

	smaps_account(mss, page, false, young, dirty, locked, present);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Account a PMD-level mapping: add HPAGE_PMD_SIZE to the anon / shmem /
 * file THP counter that matches the folio type (zone-device folios add to
 * none of them), then account it as a compound page.
 */
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	bool locked = !!(vma->vm_flags & VM_LOCKED);
	struct page *page = NULL;
	bool present = false;
	struct folio *folio;

	if (pmd_present(*pmd)) {
		page = vm_normal_page_pmd(vma, addr, *pmd);
		present = true;
	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
		swp_entry_t entry = pmd_to_swp_entry(*pmd);

		if (is_pfn_swap_entry(entry))
			page = pfn_swap_entry_to_page(entry);
	}
	if (IS_ERR_OR_NULL(page))
		return;
	folio = page_folio(page);
	if (folio_test_anon(folio))
		mss->anonymous_thp += HPAGE_PMD_SIZE;
	else if (folio_test_swapbacked(folio))
		mss->shmem_thp += HPAGE_PMD_SIZE;
	else if (folio_is_zone_device(folio))
		/* pass */;
	else
		mss->file_thp += HPAGE_PMD_SIZE;

	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
		      locked, present);
}
#else
/* Without CONFIG_TRANSPARENT_HUGEPAGE there is nothing to account. */
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
		struct mm_walk *walk)
{
}
#endif

/*
 * pmd_entry callback of the smaps page walk.  Handles the PMD either as
 * one huge mapping (under the lock from pmd_trans_huge_lock()) or by
 * iterating over its PTEs under the PTE lock.  Always returns 0.
 */
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct vm_area_struct *vma =
walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		smaps_pmd_entry(pmd, addr, walk);
		spin_unlock(ptl);
		goto out;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte) {
		/* no PTE table could be mapped: ask the walker to retry */
		walk->action = ACTION_AGAIN;
		return 0;
	}
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(pte, addr, walk);
	/* the loop left pte one past the last entry visited */
	pte_unmap_unlock(pte - 1, ptl);
out:
	cond_resched();
	return 0;
}

/*
 * Print the "VmFlags:" line: one two-letter mnemonic, followed by a space,
 * for every bit set in vma->vm_flags.  Bits without a table entry print
 * "??"; bits mapped to the empty string are skipped.
 */
static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 *
	 * The length of the second argument of mnemonics[]
	 * needs to be 3 instead of previously set 2
	 * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
	 * to avoid spurious
	 * -Werror=unterminated-string-initialization warning
	 *  with GCC 15
	 */
	static const char mnemonics[BITS_PER_LONG][3] = {
		/*
		 * In case if we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_LOCKONFAULT)]	= "lf",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_SYNC)]	= "sf",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_WIPEONFORK)]	= "wf",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_ARM64_BTI
		[ilog2(VM_ARM64_BTI)]	= "bt",
#endif
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
		[ilog2(VM_UFFD_MISSING)]= "um",
		[ilog2(VM_UFFD_WP)]	= "uw",
#ifdef CONFIG_ARM64_MTE
		[ilog2(VM_MTE)]		= "mt",
		[ilog2(VM_MTE_ALLOWED)]	= "",
#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
		/* These come out via ProtectionKey: */
		[ilog2(VM_PKEY_BIT0)]	= "",
		[ilog2(VM_PKEY_BIT1)]	= "",
		[ilog2(VM_PKEY_BIT2)]	= "",
#if VM_PKEY_BIT3
		[ilog2(VM_PKEY_BIT3)]	= "",
#endif
#if VM_PKEY_BIT4
		[ilog2(VM_PKEY_BIT4)]	= "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		[ilog2(VM_UFFD_MINOR)]	= "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
		[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
		[ilog2(VM_DROPPABLE)] = "dp",
#endif
#ifdef CONFIG_64BIT
		[ilog2(VM_SEALED)] = "sl",
#endif
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		/* an empty mnemonic marks a bit that is deliberately hidden */
		if (!mnemonics[i][0])
			continue;
		if (vma->vm_flags & (1UL << i))
			seq_printf(m, "%s ", mnemonics[i]);
	}
	seq_putc(m, '\n');
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb_entry callback: add one huge page size to either
 * shared_hugetlb or private_hugetlb.  Entries that resolve to no folio are
 * not counted.  Always returns 0.
 */
static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long end,
				 struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
	struct folio *folio = NULL;
	bool present = false;

	if (pte_present(ptent)) {
		folio = page_folio(pte_page(ptent));
		present = true;
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (is_pfn_swap_entry(swpent))
			folio = pfn_swap_entry_folio(swpent);
	}

	if (folio) {
		/* We treat non-present entries as "maybe shared". */
		if (!present || folio_likely_mapped_shared(folio) ||
		    hugetlb_pmd_shared(pte))
			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
		else
			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
	}
	return 0;
}
#else
#define smaps_hugetlb_range	NULL
#endif /* HUGETLB_PAGE */

/* Walk ops used when holes need no special accounting. */
static const struct mm_walk_ops smaps_walk_ops = {
	.pmd_entry		= smaps_pte_range,
	.hugetlb_entry		= smaps_hugetlb_range,
	.walk_lock		= PGWALK_RDLOCK,
};

/* Same as smaps_walk_ops plus the pte_hole callback for shmem swap. */
static const struct mm_walk_ops smaps_shmem_walk_ops = {
	.pmd_entry		= smaps_pte_range,
	.hugetlb_entry		= smaps_hugetlb_range,
	.pte_hole		= smaps_pte_hole,
	.walk_lock		= PGWALK_RDLOCK,
};

/*
 * Gather mem stats from @vma with the indicated beginning
 * address @start, and keep them in @mss.
 *
 * Use vm_start of @vma as the beginning address if @start is 0.
 */
static void smap_gather_stats(struct vm_area_struct *vma,
		struct mem_size_stats *mss, unsigned long start)
{
	const struct mm_walk_ops *ops = &smaps_walk_ops;

	/* Invalid start */
	if (start >= vma->vm_end)
		return;

	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
		/*
		 * For shared or readonly shmem mappings we know that all
		 * swapped out pages belong to the shmem object, and we can
		 * obtain the swap value much more efficiently. For private
		 * writable mappings, we might have COW pages that are
		 * not affected by the parent swapped out pages of the shmem
		 * object, so we have to distinguish them during the page walk.
		 * Unless we know that the shmem object (or the part mapped by
		 * our VMA) has no swapped out pages at all.
		 */
		unsigned long shmem_swapped = shmem_swap_usage(vma);

		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
					!(vma->vm_flags & VM_WRITE))) {
			mss->swap += shmem_swapped;
		} else {
			ops = &smaps_shmem_walk_ops;
		}
	}

	/* mmap_lock is held in m_start */
	if (!start)
		walk_page_vma(vma, ops, mss);
	else
		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
}

/*
 * Print the byte count @val shifted right by 10 (i.e. in kB), passing 8 as
 * the width argument.  Relies on a seq_file pointer named 'm' being in
 * scope at the point of use.
 */
#define SEQ_PUT_DEC(str, val) \
		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)

/* Show the contents common for smaps and smaps_rollup */
static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
	bool rollup_mode)
{
	SEQ_PUT_DEC("Rss: ", mss->resident);
	SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT);
	SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT);
	if (rollup_mode) {
		/*
		 * These are meaningful only for smaps_rollup, otherwise two of
		 * them are zero, and the other one is the same as Pss.
		 */
		SEQ_PUT_DEC(" kB\nPss_Anon: ",
			mss->pss_anon >> PSS_SHIFT);
		SEQ_PUT_DEC(" kB\nPss_File: ",
			mss->pss_file >> PSS_SHIFT);
		SEQ_PUT_DEC(" kB\nPss_Shmem: ",
			mss->pss_shmem >> PSS_SHIFT);
	}
	SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean);
	SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty);
	SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean);
	SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
	SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
	SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
	SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
	SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
	SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
	SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
	SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
	/* this one line is emitted with width 7 rather than 8 */
	seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
				  mss->private_hugetlb >> 10, 7);
	SEQ_PUT_DEC(" kB\nSwap: ", mss->swap);
	SEQ_PUT_DEC(" kB\nSwapPss: ",
					mss->swap_pss >> PSS_SHIFT);
	SEQ_PUT_DEC(" kB\nLocked: ",
					mss->pss_locked >> PSS_SHIFT);
	seq_puts(m, " kB\n");
}

static int
show_smap(struct seq_file *m, void *v)
{
	/*
	 * seq_file ->show for smaps: @v is the VMA to report.  Prints the maps
	 * line, the size lines, the gathered statistics, THP eligibility, the
	 * protection key (when pkeys are enabled) and the VmFlags line.
	 */
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss = {};

	smap_gather_stats(vma, &mss, 0);

	show_map_vma(m, vma);

	SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start);
	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
	SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma));
	seq_puts(m, " kB\n");

	__show_smap(m, &mss, false);

	seq_printf(m, "THPeligible: %8u\n",
		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));

	if (arch_pkeys_enabled())
		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
	show_smap_vma_flags(m, vma);

	return 0;
}

/*
 * Single-shot ->show for smaps_rollup: sum the statistics of every VMA of
 * the mm into one mem_size_stats and print them under a "[rollup]" header
 * that spans from the first VMA's start to the last VMA's end.
 *
 * Returns 0, -ESRCH when the task or mm is gone, or the error from
 * mmap_read_lock_killable().
 */
static int show_smaps_rollup(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct mem_size_stats mss = {};
	struct mm_struct *mm = priv->mm;
	struct vm_area_struct *vma;
	unsigned long vma_start = 0, last_vma_end = 0;
	int ret = 0;
	VMA_ITERATOR(vmi, mm, 0);

	priv->task = get_proc_task(priv->inode);
	if (!priv->task)
		return -ESRCH;

	if (!mm || !mmget_not_zero(mm)) {
		ret = -ESRCH;
		goto out_put_task;
	}

	ret = mmap_read_lock_killable(mm);
	if (ret)
		goto out_put_mm;

	hold_task_mempolicy(priv);
	vma = vma_next(&vmi);

	if (unlikely(!vma))
		goto empty_set;

	vma_start = vma->vm_start;
	do {
		smap_gather_stats(vma, &mss, 0);
		last_vma_end = vma->vm_end;

		/*
		 * Release mmap_lock temporarily if someone wants to
		 * access it for write request.
		 */
		if (mmap_lock_is_contended(mm)) {
			vma_iter_invalidate(&vmi);
			mmap_read_unlock(mm);
			ret = mmap_read_lock_killable(mm);
			if (ret) {
				/* lock is not held here: skip the unlock below */
				release_task_mempolicy(priv);
				goto out_put_mm;
			}

			/*
			 * After dropping the lock, there are four cases to
			 * consider. See the following example for explanation.
			 *
			 *   +------+------+-----------+
			 *   | VMA1 | VMA2 | VMA3      |
			 *   +------+------+-----------+
			 *   |      |      |           |
			 *  4k     8k     16k         400k
			 *
			 * Suppose we drop the lock after reading VMA2 due to
			 * contention, then we get:
			 *
			 *	last_vma_end = 16k
			 *
			 * 1) VMA2 is freed, but VMA3 exists:
			 *
			 *    vma_next(vmi) will return VMA3.
			 *    In this case, just continue from VMA3.
			 *
			 * 2) VMA2 still exists:
			 *
			 *    vma_next(vmi) will return VMA3.
			 *    In this case, just continue from VMA3.
			 *
			 * 3) No more VMAs can be found:
			 *
			 *    vma_next(vmi) will return NULL.
			 *    No more things to do, just break.
			 *
			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
			 *
			 *    vma_next(vmi) will return VMA' whose range
			 *    contains last_vma_end.
			 *    Iterate VMA' from last_vma_end.
			 */
			vma = vma_next(&vmi);
			/* Case 3 above */
			if (!vma)
				break;

			/* Case 1 and 2 above */
			if (vma->vm_start >= last_vma_end) {
				smap_gather_stats(vma, &mss, 0);
				last_vma_end = vma->vm_end;
				continue;
			}

			/* Case 4 above */
			if (vma->vm_end > last_vma_end) {
				smap_gather_stats(vma, &mss, last_vma_end);
				last_vma_end = vma->vm_end;
			}
		}
	} for_each_vma(vmi, vma);

empty_set:
	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
	seq_pad(m, ' ');
	seq_puts(m, "[rollup]\n");

	__show_smap(m, &mss, true);

	release_task_mempolicy(priv);
	mmap_read_unlock(mm);

out_put_mm:
	mmput(mm);
out_put_task:
	put_task_struct(priv->task);
	priv->task = NULL;

	return ret;
}
#undef SEQ_PUT_DEC

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

/*
 * ->open for smaps_rollup.  The private data is allocated here with
 * kzalloc() and handed to single_open(); every failure path frees it
 * again before returning the error.
 */
static int smaps_rollup_open(struct inode *inode, struct file *file)
{
	int ret;
	struct proc_maps_private *priv;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT);
	if (!priv)
		return -ENOMEM;

	ret = single_open(file, show_smaps_rollup, priv);
	if (ret)
		goto out_free;

	priv->inode = inode;
	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
	if (IS_ERR(priv->mm)) {
		ret = PTR_ERR(priv->mm);

		single_release(inode, file);
		goto out_free;
	}

	return 0;

out_free:
	kfree(priv);
	return ret;
}

/*
 * ->release for smaps_rollup: mmdrop() priv->mm if it is set, free the
 * private data, then let single_release() finish the job.
 */
static int smaps_rollup_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct proc_maps_private *priv = seq->private;

	if (priv->mm)
mmdrop(priv->mm); kfree(priv); return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; const struct file_operations proc_pid_smaps_rollup_operations = { .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, .release = smaps_rollup_release, }; enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; struct clear_refs_private { enum clear_refs_types type; }; #ifdef CONFIG_MEM_SOFT_DIRTY static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; if (!pte_write(pte)) return false; if (!is_cow_mapping(vma->vm_flags)) return false; if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) return false; folio = vm_normal_folio(vma, addr, pte); if (!folio) return false; return folio_maybe_dma_pinned(folio); } static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. 
*/ pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; if (pte_is_pinned(vma, addr, ptent)) return; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } #else static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } #endif #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { } #endif static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; } if (!pmd_present(*pmd)) goto out; folio = pmd_folio(*pmd); /* Clear accessed and referenced bits. 
*/ pmdp_test_and_clear_young(vma, addr, pmd); folio_test_clear_young(folio); folio_clear_referenced(folio); out: spin_unlock(ptl); return 0; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); continue; } if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio) continue; /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); folio_test_clear_young(folio); folio_clear_referenced(folio); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; } static int clear_refs_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; struct vm_area_struct *vma = walk->vma; if (vma->vm_flags & VM_PFNMAP) return 1; /* * Writing 1 to /proc/pid/clear_refs affects all pages. * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. * Writing 4 to /proc/pid/clear_refs affects all pages. 
*/ if (cp->type == CLEAR_REFS_ANON && vma->vm_file) return 1; if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) return 1; return 0; } static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; type = (enum clear_refs_types)itype; if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; mm = get_task_mm(task); if (mm) { VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; if (mmap_write_lock_killable(mm)) { count = -EINTR; goto out_mm; } if (type == CLEAR_REFS_MM_HIWATER_RSS) { /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. 
*/ reset_mm_hiwater_rss(mm); goto out_unlock; } if (type == CLEAR_REFS_SOFT_DIRTY) { for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); dec_tlb_flush_pending(mm); } out_unlock: mmap_write_unlock(mm); out_mm: mmput(mm); } put_task_struct(task); return count; } const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, .llseek = noop_llseek, }; typedef struct { u64 pme; } pagemap_entry_t; struct pagemapread { int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ pagemap_entry_t *buffer; bool show_pfn; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) #define PAGEMAP_WALK_MASK (PMD_MASK) #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 frame, u64 flags) { return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; } static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) { pm->buffer[pm->pos++] = *pme; if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; } static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; int err = 0; while (addr < end) { struct vm_area_struct *vma = find_vma(walk->mm, addr); pagemap_entry_t pme = make_pme(0, 0); /* End of address space hole, which we mark as 
non-present. */ unsigned long hole_end; if (vma) hole_end = min(end, vma->vm_start); else hole_end = end; for (; addr < hole_end; addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } if (!vma) break; /* Addresses in the VMA. */ if (vma->vm_flags & VM_SOFTDIRTY) pme = make_pme(0, PM_SOFT_DIRTY); for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { err = add_to_pagemap(&pme, pm); if (err) goto out; } } out: return err; } static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame = 0, flags = 0; struct page *page = NULL; struct folio *folio; if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); if (pm->show_pfn) { pgoff_t offset; /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. 
*/ if (is_pfn_swap_entry(entry)) offset = swp_offset_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); if (pte_marker_entry_uffd_wp(entry)) flags |= PM_UFFD_WP; } if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; if ((flags & PM_PRESENT) && folio_precise_page_mapcount(folio, page) == 1) flags |= PM_MMAP_EXCLUSIVE; } if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; struct page *page = NULL; struct folio *folio = NULL; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; if (pmd_present(pmd)) { page = pmd_page(pmd); flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + idx; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); unsigned long offset; if (pm->show_pfn) { if (is_pfn_swap_entry(entry)) offset = swp_offset_pfn(entry) + idx; else offset = swp_offset(entry) + idx; frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); page = pfn_swap_entry_to_page(entry); } #endif if (page) { folio = page_folio(page); if (!folio_test_anon(folio)) flags |= PM_FILE; } for (; addr != end; addr += PAGE_SIZE, idx++) { u64 
cur_flags = flags; pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && folio_precise_page_mapcount(folio, page + idx) == 1) cur_flags |= PM_MMAP_EXCLUSIVE; pme = make_pme(frame, cur_flags); err = add_to_pagemap(&pme, pm); if (err) break; if (pm->show_pfn) { if (flags & PM_PRESENT) frame++; else if (flags & PM_SWAP) frame += (1 << MAX_SWAPFILES_SHIFT); } } spin_unlock(ptl); return err; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * We can assume that @vma always points to a valid one and @end never * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return err; } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(&pme, pm); if (err) break; } pte_unmap_unlock(orig_pte, ptl); cond_resched(); return err; } #ifdef CONFIG_HUGETLB_PAGE /* This function walks within one hugetlb entry in the single call */ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct pagemapread *pm = walk->private; struct vm_area_struct *vma = walk->vma; u64 flags = 0, frame = 0; int err = 0; pte_t pte; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); if (!folio_test_anon(folio)) flags |= PM_FILE; if (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; if (huge_pte_uffd_wp(pte)) flags |= PM_UFFD_WP; flags |= PM_PRESENT; if (pm->show_pfn) frame = pte_pfn(pte) + ((addr & ~hmask) >> PAGE_SHIFT); } else if (pte_swp_uffd_wp_any(pte)) { flags |= PM_UFFD_WP; } for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); err = add_to_pagemap(&pme, pm); if (err) return err; if (pm->show_pfn && (flags & PM_PRESENT)) frame++; } cond_resched(); return 
err; } #else #define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, .walk_lock = PGWALK_RDLOCK, }; /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * * For each page in the address space, this file contains one 64-bit entry * consisting of the following: * * Bits 0-54 page frame number (PFN) if present * Bits 0-4 swap type if swapped * Bits 5-54 swap offset if swapped * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bit 57 pte is uffd-wp write-protected * Bits 58-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present * * If the page is not present but in swap, then the PFN contains an * encoding of the swap file number and the page's offset into the * swap. Unmapped pages return a null PFN. This allows determining * precisely which pages are mapped (or in swap) and comparing mapped * pages between processes. * * Efficient users of this interface will use /proc/pid/maps to * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. 
*/ static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct mm_struct *mm = file->private_data; struct pagemapread pm; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; int ret = 0, copied = 0; if (!mm || !mmget_not_zero(mm)) goto out; ret = -EINVAL; /* file position must be aligned */ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) goto out_mm; ret = 0; if (!count) goto out_mm; /* do not disclose physical addresses: attack vector */ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; src = *ppos; svpfn = src / PM_ENTRY_BYTES; end_vaddr = mm->task_size; /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { unsigned long end; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); if (end >= start_vaddr && end < mm->task_size) end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; unsigned long end; pm.pos = 0; end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? 
*/ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; ret = mmap_read_lock_killable(mm); if (ret) goto out_free; ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); mmap_read_unlock(mm); start_vaddr = end; len = min(count, PM_ENTRY_BYTES * pm.pos); if (copy_to_user(buf, pm.buffer, len)) { ret = -EFAULT; goto out_free; } copied += len; buf += len; count -= len; } *ppos += copied; if (!ret || ret == PM_END_OF_BUFFER) ret = copied; out_free: kfree(pm.buffer); out_mm: mmput(mm); out: return ret; } static int pagemap_open(struct inode *inode, struct file *file) { struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); if (IS_ERR(mm)) return PTR_ERR(mm); file->private_data = mm; return 0; } static int pagemap_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) mmdrop(mm); return 0; } #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { struct pm_scan_arg arg; unsigned long masks_of_interest, cur_vma_category; struct page_region *vec_buf; unsigned long vec_buf_len, vec_buf_index, found_pages; struct page_region __user *vec_out; }; static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long categories = 0; if (pte_present(pte)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page(vma, addr, pte); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if 
(!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pte_to_swp_entry(pte); if (is_pfn_swap_entry(swp) && !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t ptent) { if (pte_present(ptent)) { pte_t old_pte; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_mkuffd_wp(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } else { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); } } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long categories = PAGE_IS_HUGE; if (pmd_present(pmd)) { struct page *page; categories |= PAGE_IS_PRESENT; if (!pmd_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (p->masks_of_interest & PAGE_IS_FILE) { page = vm_normal_page_pmd(vma, addr, pmd); if (page && !PageAnon(page)) categories |= PAGE_IS_FILE; } if (is_zero_pfn(pmd_pfn(pmd))) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pmd(pmd)) { swp_entry_t swp; categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; if (pmd_swp_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { swp = pmd_to_swp_entry(pmd); if (is_pfn_swap_entry(swp) && !folio_test_anon(pfn_swap_entry_folio(swp))) categories |= PAGE_IS_FILE; } } return categories; } static void make_uffd_wp_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = 
pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HUGETLB_PAGE static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for * swapped pages. */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) categories |= PAGE_IS_FILE; if (is_zero_pfn(pte_pfn(pte))) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } else if (is_swap_pte(pte)) { categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } return categories; } static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { unsigned long psize; if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) return; psize = huge_page_size(hstate_vma(vma)); if (is_hugetlb_entry_migration(ptent)) set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); else if (!huge_pte_none(ptent)) huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, huge_pte_mkuffd_wp(ptent)); else set_huge_pte_at(vma->vm_mm, addr, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); } #endif /* CONFIG_HUGETLB_PAGE */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) static void pagemap_scan_backout_range(struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; if (cur_buf->start != addr) cur_buf->end = addr; else cur_buf->start = cur_buf->end = 0; p->found_pages -= (end - addr) / PAGE_SIZE; } #endif static 
bool pagemap_scan_is_interesting_page(unsigned long categories, const struct pagemap_scan_private *p) { categories ^= p->arg.category_inverted; if ((categories & p->arg.category_mask) != p->arg.category_mask) return false; if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) return false; return true; } static bool pagemap_scan_is_interesting_vma(unsigned long categories, const struct pagemap_scan_private *p) { unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; categories ^= p->arg.category_inverted; if ((categories & required) != required) return false; return true; } static int pagemap_scan_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long vma_category = 0; bool wp_allowed = userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma); if (!wp_allowed) { /* User requested explicit failure over wp-async capability */ if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) return -EPERM; /* * User requires wr-protect, and allows silently skipping * unsupported vmas. */ if (p->arg.flags & PM_SCAN_WP_MATCHING) return 1; /* * Then the request doesn't involve wr-protects at all, * fall through to the rest checks, and allow vma walk. */ } if (vma->vm_flags & VM_PFNMAP) return 1; if (wp_allowed) vma_category |= PAGE_IS_WPALLOWED; if (vma->vm_flags & VM_SOFTDIRTY) vma_category |= PAGE_IS_SOFT_DIRTY; if (!pagemap_scan_is_interesting_vma(vma_category, p)) return 1; p->cur_vma_category = vma_category; return 0; } static bool pagemap_scan_push_range(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long end) { struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; /* * When there is no output buffer provided at all, the sentinel values * won't match here. There is no other way for `cur_buf->end` to be * non-zero other than it being non-empty. 
*/ if (addr == cur_buf->end && categories == cur_buf->categories) { cur_buf->end = end; return true; } if (cur_buf->end) { if (p->vec_buf_index >= p->vec_buf_len - 1) return false; cur_buf = &p->vec_buf[++p->vec_buf_index]; } cur_buf->start = addr; cur_buf->end = end; cur_buf->categories = categories; return true; } static int pagemap_scan_output(unsigned long categories, struct pagemap_scan_private *p, unsigned long addr, unsigned long *end) { unsigned long n_pages, total_pages; int ret = 0; if (!p->vec_buf) return 0; categories &= p->arg.return_mask; n_pages = (*end - addr) / PAGE_SIZE; if (check_add_overflow(p->found_pages, n_pages, &total_pages) || total_pages > p->arg.max_pages) { size_t n_too_much = total_pages - p->arg.max_pages; *end -= n_too_much * PAGE_SIZE; n_pages -= n_too_much; ret = -ENOSPC; } if (!pagemap_scan_push_range(categories, p, addr, *end)) { *end = addr; n_pages = 0; ret = -ENOSPC; } p->found_pages += n_pages; if (ret) p->arg.walk_end = *end; return ret; } static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) return -ENOENT; categories = p->cur_vma_category | pagemap_thp_category(p, vma, start, *pmd); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~p->arg.flags & PM_SCAN_WP_MATCHING) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; /* * Break huge page into small pages if the WP operation * needs to be performed on a portion of the huge page. 
*/ if (end != start + HPAGE_SIZE) { spin_unlock(ptl); split_huge_pmd(vma, pmd, start); pagemap_scan_backout_range(p, start, end); /* Report as if there was no THP */ return -ENOENT; } make_uffd_wp_pmd(vma, start, pmd); flush_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); return ret; #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ return -ENOENT; #endif } static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long addr, flush_end = 0; pte_t *pte, *start_pte; spinlock_t *ptl; int ret; arch_enter_lazy_mmu_mode(); ret = pagemap_scan_thp_entry(pmd, start, end, walk); if (ret != -ENOENT) { arch_leave_lazy_mmu_mode(); return ret; } ret = 0; start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); if (!pte) { arch_leave_lazy_mmu_mode(); walk->action = ACTION_AGAIN; return 0; } if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = addr + PAGE_SIZE; } goto flush_and_return; } if (!p->arg.category_anyof_mask && !p->arg.category_inverted && p->arg.category_mask == PAGE_IS_WRITTEN && p->arg.return_mask == PAGE_IS_WRITTEN) { for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { unsigned long next = addr + PAGE_SIZE; pte_t ptent = ptep_get(pte); if ((pte_present(ptent) && pte_uffd_wp(ptent)) || pte_swp_uffd_wp_any(ptent)) continue; ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } goto flush_and_return; } for 
(addr = start; addr != end; pte++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(pte); unsigned long categories = p->cur_vma_category | pagemap_page_category(p, vma, addr, ptent); unsigned long next = addr + PAGE_SIZE; if (!pagemap_scan_is_interesting_page(categories, p)) continue; ret = pagemap_scan_output(categories, p, addr, &next); if (next == addr) break; if (~p->arg.flags & PM_SCAN_WP_MATCHING) continue; if (~categories & PAGE_IS_WRITTEN) continue; make_uffd_wp_pte(vma, addr, pte, ptent); if (!flush_end) start = addr; flush_end = next; } flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); pte_unmap_unlock(start_pte, ptl); arch_leave_lazy_mmu_mode(); cond_resched(); return ret; } #ifdef CONFIG_HUGETLB_PAGE static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; unsigned long categories; spinlock_t *ptl; int ret = 0; pte_t pte; if (~p->arg.flags & PM_SCAN_WP_MATCHING) { /* Go the short route when not write-protecting pages. */ pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) return 0; return pagemap_scan_output(categories, p, start, &end); } i_mmap_lock_write(vma->vm_file->f_mapping); ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) goto out_unlock; ret = pagemap_scan_output(categories, p, start, &end); if (start == end) goto out_unlock; if (~categories & PAGE_IS_WRITTEN) goto out_unlock; if (end != start + HPAGE_SIZE) { /* Partial HugeTLB page WP isn't possible. 
*/ pagemap_scan_backout_range(p, start, end); p->arg.walk_end = start; ret = 0; goto out_unlock; } make_uffd_wp_huge_pte(vma, start, ptep, pte); flush_hugetlb_tlb_range(vma, start, end); out_unlock: spin_unlock(ptl); i_mmap_unlock_write(vma->vm_file->f_mapping); return ret; } #else #define pagemap_scan_hugetlb_entry NULL #endif static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, int depth, struct mm_walk *walk) { struct pagemap_scan_private *p = walk->private; struct vm_area_struct *vma = walk->vma; int ret, err; if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) return 0; ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); if (addr == end) return ret; if (~p->arg.flags & PM_SCAN_WP_MATCHING) return ret; err = uffd_wp_range(vma, addr, end - addr, true); if (err < 0) ret = err; return ret; } static const struct mm_walk_ops pagemap_scan_ops = { .test_walk = pagemap_scan_test_walk, .pmd_entry = pagemap_scan_pmd_entry, .pte_hole = pagemap_scan_pte_hole, .hugetlb_entry = pagemap_scan_hugetlb_entry, }; static int pagemap_scan_get_args(struct pm_scan_arg *arg, unsigned long uarg) { if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) return -EFAULT; if (arg->size != sizeof(struct pm_scan_arg)) return -EINVAL; /* Validate requested features */ if (arg->flags & ~PM_SCAN_FLAGS) return -EINVAL; if ((arg->category_inverted | arg->category_mask | arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) return -EINVAL; arg->start = untagged_addr((unsigned long)arg->start); arg->end = untagged_addr((unsigned long)arg->end); arg->vec = untagged_addr((unsigned long)arg->vec); /* Validate memory pointers */ if (!IS_ALIGNED(arg->start, PAGE_SIZE)) return -EINVAL; if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) return -EFAULT; if (!arg->vec && arg->vec_len) return -EINVAL; if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX) return -EINVAL; if (arg->vec && !access_ok((void __user 
*)(long)arg->vec, size_mul(arg->vec_len, sizeof(struct page_region)))) return -EFAULT; /* Fixup default values */ arg->end = ALIGN(arg->end, PAGE_SIZE); arg->walk_end = 0; if (!arg->max_pages) arg->max_pages = ULONG_MAX; return 0; } /* * Copies only the walk_end field of *arg back into the user's struct pm_scan_arg at address uargl. Returns 0 on success or -EFAULT if the copy fails. */ static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, unsigned long uargl) { struct pm_scan_arg __user *uarg = (void __user *)uargl; if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) return -EFAULT; return 0; } /* * Allocates the kernel-side array of struct page_region that is later copied to user space. Nothing is allocated when arg.vec_len is 0. The array holds PAGEMAP_WALK_SIZE >> PAGE_SHIFT entries, or arg.vec_len entries if that is smaller. The first entry is marked empty (start == end == 0) and vec_out is pointed at the user vector. Returns 0 on success or -ENOMEM. */ static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) { if (!p->arg.vec_len) return 0; p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, p->arg.vec_len); p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), GFP_KERNEL); if (!p->vec_buf) return -ENOMEM; p->vec_buf->start = p->vec_buf->end = 0; p->vec_out = (struct page_region __user *)(long)p->arg.vec; return 0; } /* * Copies the filled entries of the bounce buffer to p->vec_out and resets the buffer for reuse. The entry at vec_buf_index is included only when it is non-empty (end != start). After a successful copy arg.vec_len and vec_buf_len shrink by the number of entries written and vec_out advances past them. Returns the number of entries copied, 0 when there is no buffer or nothing to copy, or -EFAULT if the copy fails. */ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) { const struct page_region *buf = p->vec_buf; long n = p->vec_buf_index; if (!p->vec_buf) return 0; if (buf[n].end != buf[n].start) n++; if (!n) return 0; if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) return -EFAULT; p->arg.vec_len -= n; p->vec_out += n; p->vec_buf_index = 0; p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); p->vec_buf->start = p->vec_buf->end = 0; return n; } /* * Handler for the PAGEMAP_SCAN command. Reads and validates the arguments, then walks [arg.start, arg.end) of mm in one or more passes, flushing the bounce buffer to user space after each pass. A pass that stops with -ENOSPC is followed by another pass starting at arg.walk_end, unless the output vector is used up or found_pages has reached max_pages. Returns the total number of page_region entries written, or a negative error code. Once the arguments have been accepted and the buffer set up, walk_end is written back to user space on every exit path. */ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) { struct pagemap_scan_private p = {0}; unsigned long walk_start; size_t n_ranges_out = 0; int ret; ret = pagemap_scan_get_args(&p.arg, uarg); if (ret) return ret; p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | p.arg.return_mask; ret = pagemap_scan_init_bounce_buffer(&p); if (ret) return ret; for (walk_start = p.arg.start; walk_start < p.arg.end; walk_start = p.arg.walk_end) { struct mmu_notifier_range range; long n_out; if (fatal_signal_pending(current)) { ret = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) break; /* 
Protection change for the range is going to happen. */ if (p.arg.flags & PM_SCAN_WP_MATCHING) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, walk_start, p.arg.end); mmu_notifier_invalidate_range_start(&range); } ret = walk_page_range(mm, walk_start, p.arg.end, &pagemap_scan_ops, &p); if (p.arg.flags & PM_SCAN_WP_MATCHING) mmu_notifier_invalidate_range_end(&range); mmap_read_unlock(mm); /* Flush with mmap_lock already dropped; a copy failure overrides the walk result. */ n_out = pagemap_scan_flush_buffer(&p); if (n_out < 0) ret = n_out; else n_ranges_out += n_out; /* Only a buffer-full stop (-ENOSPC) justifies another pass; success or any other error ends the loop. */ if (ret != -ENOSPC) break; if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) break; } /* ENOSPC signifies early stop (buffer full) from the walk. */ if (!ret || ret == -ENOSPC) ret = n_ranges_out; /* The walk_end isn't set when ret is zero */ if (!p.arg.walk_end) p.arg.walk_end = p.arg.end; if (pagemap_scan_writeback_args(&p.arg, uarg)) ret = -EFAULT; kfree(p.vec_buf); return ret; } /* * ioctl handler of the pagemap file (used for both unlocked_ioctl and compat_ioctl). PAGEMAP_SCAN is forwarded to do_pagemap_scan() with the mm stored in file->private_data; every other command returns -EINVAL. */ static long do_pagemap_cmd(struct file *file, unsigned int cmd, unsigned long arg) { struct mm_struct *mm = file->private_data; switch (cmd) { case PAGEMAP_SCAN: return do_pagemap_scan(mm, arg); default: return -EINVAL; } } const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, .unlocked_ioctl = do_pagemap_cmd, .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA /* Page counters for one VMA, accumulated by gather_stats(). node[] is indexed by node id. */ struct numa_maps { unsigned long pages; unsigned long anon; unsigned long active; unsigned long writeback; unsigned long mapcount_max; unsigned long dirty; unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; /* seq_file private data used by show_numa_map(): the generic maps state plus the numa_maps counters. */ struct numa_maps_private { struct proc_maps_private proc_maps; struct numa_maps md; }; /* * Accounts nr_pages for page in *md. pages and the per-node counter are always increased; dirty, swapcache, active, writeback and anon are increased when the matching test on the page's folio (or the pte_dirty argument) is true. mapcount_max is raised if this page's mapcount is larger than the current maximum. */ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { struct folio *folio = page_folio(page); int count = folio_precise_page_mapcount(folio, page); md->pages += nr_pages; if (pte_dirty || folio_test_dirty(folio)) md->dirty 
+= nr_pages; if (folio_test_swapcache(folio)) md->swapcache += nr_pages; /* Unevictable folios are counted as active too. */ if (folio_test_active(folio) || folio_test_unevictable(folio)) md->active += nr_pages; if (folio_test_writeback(folio)) md->writeback += nr_pages; if (folio_test_anon(folio)) md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; md->node[folio_nid(folio)] += nr_pages; } /* * Returns the page mapped by pte if it should be counted, or NULL. Rejected are: non-present PTEs, PTEs for which vm_normal_page() yields no page, zone-device pages, reserved pages, and pages whose node is not set in node_states[N_MEMORY]. */ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pte_present(pte)) return NULL; page = vm_normal_page(vma, addr, pte); if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PMD-level counterpart of can_gather_numa_stats(), using pmd_present() and vm_normal_page_pmd(). Unlike the PTE version it has no zone-device check. */ static struct page *can_gather_numa_stats_pmd(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) { struct page *page; int nid; if (!pmd_present(pmd)) return NULL; page = vm_normal_page_pmd(vma, addr, pmd); if (!page) return NULL; if (PageReserved(page)) return NULL; nid = page_to_nid(page); if (!node_isset(nid, node_states[N_MEMORY])) return NULL; return page; } #endif /* * pmd_entry callback of show_numa_ops. If pmd_trans_huge_lock() succeeds, the PMD is accounted as one mapping of HPAGE_PMD_SIZE/PAGE_SIZE pages. Otherwise every PTE in [addr, end) is checked with can_gather_numa_stats() and accounted as a single page. Sets walk->action to ACTION_AGAIN when pte_offset_map_lock() returns NULL. Always returns 0. */ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct numa_maps *md = walk->private; struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte; pte_t *pte; #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { struct page *page; page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; } #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { walk->action = ACTION_AGAIN; return 0; } /* continue inside do/while jumps to the loop condition, so pte and addr still advance for skipped entries. */ do { pte_t ptent = ptep_get(pte); struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); 
pte_unmap_unlock(orig_pte, ptl); cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE /* * hugetlb_entry callback of show_numa_ops. A present huge PTE has its page passed to gather_stats() with nr_pages of 1; a non-present entry is ignored. hmask and end are unused. Always returns 0. */ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte); struct numa_maps *md; struct page *page; if (!pte_present(huge_pte)) return 0; page = pte_page(huge_pte); md = walk->private; gather_stats(page, md, pte_dirty(huge_pte), 1); return 0; } #else /* Stub used without CONFIG_HUGETLB_PAGE: does nothing and returns 0. */ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif /* Page-walk callbacks passed to walk_page_vma() by show_numa_map(). */ static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, .walk_lock = PGWALK_RDLOCK, }; /* * Display pages allocated per node and memory policy via /proc. */ /* * seq_file show callback. v is the VMA to describe and m->private is a struct numa_maps_private. Prints one line: the VMA start address, the memory policy string, what backs the mapping (file path, heap or stack), and the non-zero counters gathered by walking the VMA. Returns 0 in all cases; a VMA without an mm produces no output. */ static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; char buffer[64]; struct mempolicy *pol; pgoff_t ilx; int nid; if (!mm) return 0; /* Ensure we start with an empty set of numa_maps statistics. 
*/ memset(md, 0, sizeof(*md)); /* Format the VMA's own policy when there is one, otherwise the task policy kept in proc_priv. */ pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); } else { mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); } seq_printf(m, "%08lx %s", vma->vm_start, buffer); if (file) { seq_puts(m, " file="); seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); /* mmap_lock is held by m_start */ walk_page_vma(vma, &show_numa_ops, md); /* No pages counted: finish the line without any counters. */ if (!md->pages) goto out; if (md->anon) seq_printf(m, " anon=%lu", md->anon); if (md->dirty) seq_printf(m, " dirty=%lu", md->dirty); /* mapped= is printed only when the total differs from both the anon and the dirty count. */ if (md->pages != md->anon && md->pages != md->dirty) seq_printf(m, " mapped=%lu", md->pages); if (md->mapcount_max > 1) seq_printf(m, " mapmax=%lu", md->mapcount_max); if (md->swapcache) seq_printf(m, " swapcache=%lu", md->swapcache); /* active= is printed only when some pages are not active, and never for hugetlb VMAs. */ if (md->active < md->pages && !is_vm_hugetlb_page(vma)) seq_printf(m, " active=%lu", md->active); if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); for_each_node_state(nid, N_MEMORY) if (md->node[nid]) seq_printf(m, " N%d=%lu", nid, md->node[nid]); seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); return 0; } /* seq_file operations for numa_maps: m_start/m_next/m_stop iterate, show_numa_map() prints each entry. */ static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map, }; /* Open handler: delegates to proc_maps_open(), passing proc_pid_numa_maps_op and sizeof(struct numa_maps_private), and returns its result. */ static int pid_numa_maps_open(struct inode *inode, struct file *file) { return proc_maps_open(inode, file, &proc_pid_numa_maps_op, sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, }; #endif /* CONFIG_NUMA */
2243 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Dynamic loading of modules into the kernel. * * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996 * Rewritten again by Rusty Russell, 2002 */ #ifndef _LINUX_MODULE_H #define _LINUX_MODULE_H #include <linux/list.h> #include <linux/stat.h> #include <linux/buildid.h> #include <linux/compiler.h> #include <linux/cache.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/elf.h> #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/jump_label.h> #include <linux/export.h> #include <linux/rbtree_latch.h> #include <linux/error-injection.h> #include <linux/tracepoint-defs.h> #include <linux/srcu.h> #include <linux/static_call_types.h> #include <linux/dynamic_debug.h> #include <linux/percpu.h> #include <asm/module.h> #define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN struct modversion_info { unsigned long crc; char name[MODULE_NAME_LEN]; }; struct module; struct exception_table_entry; struct module_kobject { struct kobject kobj; struct module *mod; struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; } __randomize_layout; struct module_attribute { struct attribute attr; ssize_t (*show)(const struct module_attribute *, struct module_kobject *, char *); ssize_t (*store)(const struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); void (*free)(struct module *); }; struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; }; extern ssize_t __modver_version_show(const struct module_attribute *, struct module_kobject *, char *); extern const struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. 
*/ extern int init_module(void); extern void cleanup_module(void); #ifndef MODULE /** * module_init() - driver initialization entry point * @x: function to be run at kernel boot time or module insertion * * module_init() will either be called during do_initcalls() (if * builtin) or at module insertion time (if a module). There can only * be one per module. */ #define module_init(x) __initcall(x); /** * module_exit() - driver exit entry point * @x: function to be run when driver is removed * * module_exit() will wrap the driver clean-up code * with cleanup_module() when used with rmmod when * the driver is a module. If the driver is statically * compiled into the kernel, module_exit() has no effect. * There can only be one per module. */ #define module_exit(x) __exitcall(x); #else /* MODULE */ /* * In most cases loadable modules do not need custom * initcall levels. There are still some valid cases where * a driver may be needed early if built in, and does not * matter when built as a loadable module. Like bus * snooping debug drivers. */ #define early_initcall(fn) module_init(fn) #define core_initcall(fn) module_init(fn) #define core_initcall_sync(fn) module_init(fn) #define postcore_initcall(fn) module_init(fn) #define postcore_initcall_sync(fn) module_init(fn) #define arch_initcall(fn) module_init(fn) #define subsys_initcall(fn) module_init(fn) #define subsys_initcall_sync(fn) module_init(fn) #define fs_initcall(fn) module_init(fn) #define fs_initcall_sync(fn) module_init(fn) #define rootfs_initcall(fn) module_init(fn) #define device_initcall(fn) module_init(fn) #define device_initcall_sync(fn) module_init(fn) #define late_initcall(fn) module_init(fn) #define late_initcall_sync(fn) module_init(fn) #define console_initcall(fn) module_init(fn) /* Each module must use one module_init(). 
*/ #define module_init(initfn) \ static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ ___ADDRESSABLE(cleanup_module, __exitdata); #endif /* This means "can be init if no module support, otherwise module load may call it." */ #ifdef CONFIG_MODULES #define __init_or_module #define __initdata_or_module #define __initconst_or_module #define __INIT_OR_MODULE .text #define __INITDATA_OR_MODULE .data #define __INITRODATA_OR_MODULE .section ".rodata","a",%progbits #else #define __init_or_module __init #define __initdata_or_module __initdata #define __initconst_or_module __initconst #define __INIT_OR_MODULE __INIT #define __INITDATA_OR_MODULE __INITDATA #define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ /* Generic info of form tag = "info" */ #define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info) /* For userspace: you can also call me... */ #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) /* Soft module dependencies. See man modprobe.d for details. * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz") */ #define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep) /* * Weak module dependencies. See man modprobe.d for details. 
* Example: MODULE_WEAKDEP("module-foo") */ #define MODULE_WEAKDEP(_weakdep) MODULE_INFO(weakdep, _weakdep) /* * MODULE_FILE is used for generating modules.builtin * So, make it no-op when this is being built as a module */ #ifdef MODULE #define MODULE_FILE #else #define MODULE_FILE MODULE_INFO(file, KBUILD_MODFILE); #endif /* * The following license idents are currently accepted as indicating free * software modules * * "GPL" [GNU Public License v2] * "GPL v2" [GNU Public License v2] * "GPL and additional rights" [GNU Public License v2 rights and more] * "Dual BSD/GPL" [GNU Public License v2 * or BSD license choice] * "Dual MIT/GPL" [GNU Public License v2 * or MIT license choice] * "Dual MPL/GPL" [GNU Public License v2 * or Mozilla license choice] * * The following other idents are available * * "Proprietary" [Non free products] * * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are * merely stating that the module is licensed under the GPL v2, but are not * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there * are two variants is a historic and failed attempt to convey more * information in the MODULE_LICENSE string. For module loading the * "only/or later" distinction is completely irrelevant and does neither * replace the proper license identifiers in the corresponding source file * nor amends them in any way. The sole purpose is to make the * 'Proprietary' flagging work and to refuse to bind symbols which are * exported with EXPORT_SYMBOL_GPL when a non free module is loaded. * * In the same way "BSD" is not a clear license information. It merely * states, that the module is licensed under one of the compatible BSD * license variants. The detailed and correct license information is again * to be found in the corresponding source files. * * There are dual licensed components, but when running with Linux it is the * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL * is a GPL combined work. 
* * This exists for several reasons * 1. So modinfo can show license info for users wanting to vet their setup * is free * 2. So the community can ignore bug reports including proprietary modules * 3. So vendors can do likewise based on their own policies */ #define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license) /* * Author(s), use "Name <email>" or just "Name", for multiple * authors use multiple MODULE_AUTHOR() statements/lines. */ #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) /* What your module does. */ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ extern typeof(name) __mod_device_table__##type##__##name \ __attribute__ ((unused, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) #endif /* Version of form [<epoch>:]<version>[-<extra-version>]. * Or for CVS/RCS ID version, everything but the number is stripped. * <epoch>: A (small) unsigned integer which allows you to start versions * anew. If not mentioned, it's zero. eg. "2:1.0" is after * "1:2.0". * <version>: The <version> may contain only alphanumerics and the * character `.'. Ordered by numeric sort for numeric parts, * ascii sort for ascii parts (as per RPM or DEB algorithm). * <extraversion>: Like <version>, but inserted for local * customizations, eg "rh3" or "rusty1". * Using this automatically adds a checksum of the .c files and the * local headers in "srcversion". 
*/ #if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ static const struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ .mattr = { \ .attr = { \ .name = "version", \ .mode = S_IRUGO, \ }, \ .show = __modver_version_show, \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ } #endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) #define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, ns) struct notifier_block; #ifdef CONFIG_MODULES extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) /* modules using other modules: kdb wants to see this. */ struct module_use { struct list_head source_list; struct list_head target_list; struct module *source, *target; }; enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ MODULE_STATE_GOING, /* Going away. */ MODULE_STATE_UNFORMED, /* Still setting it up. 
*/ }; struct mod_tree_node { struct module *mod; struct latch_tree_node node; }; enum mod_mem_type { MOD_TEXT = 0, MOD_DATA, MOD_RODATA, MOD_RO_AFTER_INIT, MOD_INIT_TEXT, MOD_INIT_DATA, MOD_INIT_RODATA, MOD_MEM_NUM_TYPES, MOD_INVALID = -1, }; #define mod_mem_type_is_init(type) \ ((type) == MOD_INIT_TEXT || \ (type) == MOD_INIT_DATA || \ (type) == MOD_INIT_RODATA) #define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type)) #define mod_mem_type_is_text(type) \ ((type) == MOD_TEXT || \ (type) == MOD_INIT_TEXT) #define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type)) #define mod_mem_type_is_core_data(type) \ (mod_mem_type_is_core(type) && \ mod_mem_type_is_data(type)) #define for_each_mod_mem_type(type) \ for (enum mod_mem_type (type) = 0; \ (type) < MOD_MEM_NUM_TYPES; (type)++) #define for_class_mod_mem_type(type, class) \ for_each_mod_mem_type(type) \ if (mod_mem_type_is_##class(type)) struct module_memory { void *base; void *rw_copy; bool is_rox; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; #endif }; #ifdef CONFIG_MODULES_TREE_LOOKUP /* Only touch one cacheline for common rbtree-for-core-layout case. */ #define __module_memory_align ____cacheline_aligned #else #define __module_memory_align #endif struct mod_kallsyms { Elf_Sym *symtab; unsigned int num_symtab; char *strtab; char *typetab; }; #ifdef CONFIG_LIVEPATCH /** * struct klp_modinfo - ELF information preserved from the livepatch module * * @hdr: ELF header * @sechdrs: Section header table * @secstrings: String table for the section headers * @symndx: The symbol table section index */ struct klp_modinfo { Elf_Ehdr hdr; Elf_Shdr *sechdrs; char *secstrings; unsigned int symndx; }; #endif struct module { enum module_state state; /* Member of list of modules */ struct list_head list; /* Unique handle for this module */ char name[MODULE_NAME_LEN]; #ifdef CONFIG_STACKTRACE_BUILD_ID /* Module build ID */ unsigned char build_id[BUILD_ID_SIZE_MAX]; #endif /* Sysfs stuff. 
*/ struct module_kobject mkobj; struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; struct kobject *holders_dir; /* Exported symbols */ const struct kernel_symbol *syms; const u32 *crcs; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS s32 *kcfi_traps; s32 *kcfi_traps_end; #endif /* Kernel parameters. */ #ifdef CONFIG_SYSFS struct mutex param_lock; #endif struct kernel_param *kp; unsigned int num_kp; /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; const u32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG /* Signature was verified. */ bool sig_ok; #endif bool async_probe_requested; /* Exception table */ unsigned int num_exentries; struct exception_table_entry *extable; /* Startup function. */ int (*init)(void); struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align; /* Arch-specific module values */ struct mod_arch_specific arch; unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ unsigned num_bugs; struct list_head bug_list; struct bug_entry *bug_table; #endif #ifdef CONFIG_KALLSYMS /* Protected by RCU and/or module_mutex: use rcu_dereference() */ struct mod_kallsyms __rcu *kallsyms; struct mod_kallsyms core_kallsyms; /* Section attributes */ struct module_sect_attrs *sect_attrs; /* Notes attributes */ struct module_notes_attrs *notes_attrs; #endif /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; #ifdef CONFIG_SMP /* Per-cpu data. 
*/ void __percpu *percpu; unsigned int percpu_size; #endif void *noinstr_text_start; unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif #ifdef CONFIG_TREE_SRCU unsigned int num_srcu_structs; struct srcu_struct **srcu_struct_ptrs; #endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES unsigned int btf_data_size; unsigned int btf_base_data_size; void *btf_data; void *btf_base_data; #endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif #ifdef CONFIG_TRACING unsigned int num_trace_bprintk_fmt; const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_eval_map **trace_evals; unsigned int num_trace_evals; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif #ifdef CONFIG_KPROBES void *kprobes_text_start; unsigned int kprobes_text_size; unsigned long *kprobe_blacklist; unsigned int num_kprobe_blacklist; #endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE int num_static_call_sites; struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) int num_kunit_init_suites; struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ bool klp_alive; /* ELF information */ struct klp_modinfo *klp_info; #endif #ifdef CONFIG_PRINTK_INDEX unsigned int printk_index_size; struct pi_entry **printk_index_start; #endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; /* What modules do I depend on? */ struct list_head target_list; /* Destruction function. 
*/ void (*exit)(void); atomic_t refcnt; #endif #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; unsigned int num_ctors; #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; #endif #ifdef CONFIG_DYNAMIC_DEBUG_CORE struct _ddebug_info dyndbg_info; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE /* Default used unless HAVE_ARCH_KALLSYMS_SYMBOL_VALUE is defined: a symbol's value is its st_value field. */ static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { return sym->st_value; } #endif /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. But presently too much code (IDE & SCSI) require entry into the module during init.*/ /* Returns true unless mod is in MODULE_STATE_GOING. */ static inline bool module_is_live(struct module *mod) { return mod->state != MODULE_STATE_GOING; } /* Returns true only while mod is in MODULE_STATE_COMING. */ static inline bool module_is_coming(struct module *mod) { return mod->state == MODULE_STATE_COMING; } struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); /* * Returns true if addr lies in [base, base + size) of mod's memory region of the given type. The single unsigned comparison also rejects addr < base, because the subtraction then wraps around to a value that is not below size. */ static inline bool within_module_mem_type(unsigned long addr, const struct module *mod, enum mod_mem_type type) { unsigned long base, size; base = (unsigned long)mod->mem[type].base; size = mod->mem[type].size; return addr - base < size; } /* Returns true if addr lies in any memory region of mod whose type satisfies mod_mem_type_is_core(). */ static inline bool within_module_core(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, core) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } /* Returns true if addr lies in any memory region of mod whose type satisfies mod_mem_type_is_init(). */ static inline bool within_module_init(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, init) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } /* Returns true if addr lies in any init or core memory region of mod. */ static inline bool within_module(unsigned long addr, 
const struct module *mod) { return within_module_init(addr, mod) || within_module_core(addr, mod); } /* Search for module by name: must be in a RCU-sched critical section. */ struct module *find_module(const char *name); extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(__stringify(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ extern void __module_get(struct module *module); /** * try_module_get() - take module refcount unless module is being removed * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. * This call will fail if the module is in the process of being removed. * * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. This can be gauranteed through two means: * * 1) Direct protection: you know an earlier caller must have increased the * module reference through __module_get(). This can typically be achieved * by having another entity other than the module itself increment the * module reference count. * * 2) Implied protection: there is an implied protection against module * removal. An example of this is the implied protection used by kernfs / * sysfs. The sysfs store / read file operations are guaranteed to exist * through the use of kernfs's active reference (see kernfs_active()) and a * sysfs / kernfs file removal cannot happen unless the same file is not * active. Therefore, if a sysfs file is being read or written to the module * which created it must still exist. It is therefore safe to use * try_module_get() on module sysfs store / read ops. 
*
 * One of the real values to try_module_get() is the module_is_live() check
 * which ensures that the caller of try_module_get() can yield to userspace
 * module removal requests and gracefully fail if the module is on its way out.
 *
 * Returns true if the reference count was successfully incremented.
 */
extern bool try_module_get(struct module *module);

/**
 * module_put() - release a reference count to a module
 * @module: the module we should release a reference count for
 *
 * If you successfully bump a reference count to a module with try_module_get(),
 * when you are finished you must call module_put() to release that reference
 * count.
 */
extern void module_put(struct module *module);

#else /*!CONFIG_MODULE_UNLOAD*/
/*
 * Without module unloading no reference count is taken: the call succeeds
 * for a NULL module and for any module that module_is_live() accepts.
 */
static inline bool try_module_get(struct module *module)
{
	return !module || module_is_live(module);
}
/* No-op: there is no reference count to drop in this configuration. */
static inline void module_put(struct module *module)
{
}
/* No-op: there is no reference count to take in this configuration. */
static inline void __module_get(struct module *module)
{
}
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(p) do { } while (0)

#endif /* CONFIG_MODULE_UNLOAD */

/*
 * This is a #define so the string doesn't get put in every .o file.
 * Evaluates @mod exactly once; yields "kernel" when @mod is NULL.
 */
#define module_name(mod)			\
({						\
	struct module *__mod = (mod);		\
	__mod ? __mod->name : "kernel";		\
})

/* Dereference module function descriptor */
void *dereference_module_function_descriptor(struct module *mod, void *ptr);

int register_module_notifier(struct notifier_block *nb);
int unregister_module_notifier(struct notifier_block *nb);

extern void print_modules(void);

/* NULL-safe accessor for the module's async_probe_requested flag. */
static inline bool module_requested_async_probing(struct module *module)
{
	return module && module->async_probe_requested;
}

/*
 * Returns the module's klp flag when CONFIG_LIVEPATCH is enabled and false
 * otherwise. @mod is dereferenced without a NULL check.
 */
static inline bool is_livepatch_module(struct module *mod)
{
#ifdef CONFIG_LIVEPATCH
	return mod->klp;
#else
	return false;
#endif
}

void set_module_sig_enforced(void);

void *__module_writable_address(struct module *mod, void *loc);

/*
 * Returns @loc unchanged unless CONFIG_ARCH_HAS_EXECMEM_ROX is enabled,
 * @mod is non-NULL and the module is still MODULE_STATE_UNFORMED; only in
 * that case the address is translated by __module_writable_address().
 */
static inline void *module_writable_address(struct module *mod, void *loc)
{
	if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod ||
	    mod->state != MODULE_STATE_UNFORMED)
		return loc;
	return __module_writable_address(mod, loc);
}

#else /* !CONFIG_MODULES... */

/*
 * Stubs for kernels built without module support: address lookups find
 * nothing and the reference counting helpers do nothing.
 */
static inline struct module *__module_address(unsigned long addr)
{
	return NULL;
}

static inline struct module *__module_text_address(unsigned long addr)
{
	return NULL;
}

static inline bool is_module_address(unsigned long addr)
{
	return false;
}

static inline bool is_module_percpu_address(unsigned long addr)
{
	return false;
}

static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
	return false;
}

static inline bool is_module_text_address(unsigned long addr)
{
	return false;
}

static inline bool within_module_core(unsigned long addr,
				      const struct module *mod)
{
	return false;
}

static inline bool within_module_init(unsigned long addr,
				      const struct module *mod)
{
	return false;
}

static inline bool within_module(unsigned long addr, const struct module *mod)
{
	return false;
}

/* Get/put a kernel symbol (calls should be symmetric) */
#define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); })
#define symbol_put(x) do { } while (0)
#define symbol_put_addr(x) do { } while (0)

static inline void __module_get(struct module *module)
{
}

/* Always succeeds: there are no modules that could be going away. */
static inline bool try_module_get(struct module *module)
{
	return true;
}

static inline void module_put(struct module *module)
{
}

#define module_name(mod) "kernel"

static inline int register_module_notifier(struct notifier_block *nb)
{
	/* no events will happen anyway, so this can always succeed */
	return 0;
}

static inline int unregister_module_notifier(struct notifier_block *nb)
{
	return 0;
}

#define module_put_and_kthread_exit(code) kthread_exit(code)

static inline void print_modules(void)
{
}

static inline bool module_requested_async_probing(struct module *module)
{
	return false;
}


static inline void set_module_sig_enforced(void)
{
}

/* Dereference module function descriptor */
static inline
void *dereference_module_function_descriptor(struct module *mod, void *ptr)
{
	return ptr;
}

static inline bool module_is_coming(struct module *mod)
{
	return false;
}

static inline void *module_writable_address(struct module *mod, void *loc)
{
	return loc;
}
#endif /* CONFIG_MODULES */

#ifdef CONFIG_SYSFS
extern struct kset *module_kset;
extern const struct kobj_type module_ktype;
#endif /* CONFIG_SYSFS */

/* Try symbol_get(); the "symbol:<name>" string is handed to try_then_request_module(). */
#define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)

/* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */

#define __MODULE_STRING(x) __stringify(x)

#ifdef CONFIG_GENERIC_BUG
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
			 struct module *);
void module_bug_cleanup(struct module *);

#else	/* !CONFIG_GENERIC_BUG */

static inline void module_bug_finalize(const Elf_Ehdr *hdr,
					const Elf_Shdr *sechdrs,
					struct module *mod)
{
}
static inline void module_bug_cleanup(struct module *mod) {}
#endif	/* CONFIG_GENERIC_BUG */

#ifdef CONFIG_MITIGATION_RETPOLINE
extern bool retpoline_module_ok(bool has_retpoline);
#else
/* Without the retpoline mitigation every module is acceptable. */
static inline bool retpoline_module_ok(bool has_retpoline)
{
	return true;
}
#endif

#ifdef CONFIG_MODULE_SIG
bool is_module_sig_enforced(void);

/* Returns the sig_ok flag recorded in @module. */
static inline bool module_sig_ok(struct module *module)
{
	return module->sig_ok;
}
#else	/* !CONFIG_MODULE_SIG */
static inline bool is_module_sig_enforced(void)
{
	return false;
}

/* Signature checking is compiled out, so every module counts as OK. */
static inline bool module_sig_ok(struct module *module)
{
	return true;
}
#endif	/* CONFIG_MODULE_SIG */

#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS)
int module_kallsyms_on_each_symbol(const char *modname,
				   int (*fn)(void *, const char *, unsigned long),
				   void *data);

/*
 * For kallsyms to ask for address resolution.  namebuf should be at
 * least KSYM_NAME_LEN long.
 *
 * NOTE(review): older wording here said "a pointer to namebuf is returned
 * if found, otherwise NULL", but the function is declared to return int.
 * The exact meaning of the return value should be confirmed against the
 * implementation.
 */
int module_address_lookup(unsigned long addr,
			  unsigned long *symbolsize,
			  unsigned long *offset,
			  char **modname, const unsigned char **modbuildid,
			  char *namebuf);
int lookup_module_symbol_name(unsigned long addr, char *symname);
int lookup_module_symbol_attrs(unsigned long addr,
			       unsigned long *size,
			       unsigned long *offset,
			       char *modname,
			       char *name);

/*
 * Returns 0 and fills in value, defined and namebuf, or -ERANGE if
 * symnum out of range.
 */
int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
		       char *name, char *module_name, int *exported);

/* Look for this name: can be of form module:name. */
unsigned long module_kallsyms_lookup_name(const char *name);

unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);

#else	/* CONFIG_MODULES && CONFIG_KALLSYMS */

/* Stub: iterating module symbols is not supported in this configuration. */
static inline int module_kallsyms_on_each_symbol(const char *modname,
						 int (*fn)(void *, const char *, unsigned long),
						 void *data)
{
	return -EOPNOTSUPP;
}

/*
 * For kallsyms to ask for address resolution.  The stub always returns 0
 * (presumably "not found"; note the return type is int, not a pointer).
*/
static inline int module_address_lookup(unsigned long addr,
					unsigned long *symbolsize,
					unsigned long *offset,
					char **modname,
					const unsigned char **modbuildid,
					char *namebuf)
{
	return 0;
}

/* Stub: no module symbols exist, so every lookup reports -ERANGE. */
static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
{
	return -ERANGE;
}

/* Stub: every symbol index is out of range; none of the outputs is written. */
static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
				     char *type, char *name,
				     char *module_name, int *exported)
{
	return -ERANGE;
}

/* Stub: always returns 0. */
static inline unsigned long module_kallsyms_lookup_name(const char *name)
{
	return 0;
}

/* Stub: always returns 0. */
static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
						       const char *name)
{
	return 0;
}

#endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */

#endif /* _LINUX_MODULE_H */
16 16 16 16 16 16 16 16 15 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 /* * Non-physical true random number generator based on timing jitter -- * Jitter RNG standalone code. * * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Design * ====== * * See https://www.chronox.de/jent.html * * License * ======= * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * This Jitterentropy RNG is based on the jitterentropy library * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c." 
#endif

/*
 * Minimal fixed-width types and NULL: this file is built stand-alone and
 * defines what it needs itself.
 */
typedef	unsigned long long	__u64;
typedef	long long		__s64;
typedef	unsigned int		__u32;
typedef unsigned char		u8;
#define NULL    ((void *) 0)

/*
 * The entropy pool
 *
 * One instance holds the complete state of a Jitter RNG: the conditioner
 * (hash) state, the previous time stamp and deltas used by the stuck test,
 * the memory-access noise source parameters and the health test counters.
 */
struct rand_data {
	/* SHA3-256 is used as conditioner */
#define DATA_SIZE_BITS 256
	/* all data values that are vital to maintain the security
	 * of the RNG are marked as SENSITIVE. A user must not
	 * access that information while the RNG executes its loops to
	 * calculate the next random value. */
	void *hash_state;		/* SENSITIVE hash state entropy pool */
	__u64 prev_time;		/* SENSITIVE Previous time stamp */
	__u64 last_delta;		/* SENSITIVE stuck test */
	__s64 last_delta2;		/* SENSITIVE stuck test */

	unsigned int flags;		/* Flags used to initialize */
	unsigned int osr;		/* Oversample rate */
#define JENT_MEMORY_ACCESSLOOPS 128
#define JENT_MEMORY_SIZE						\
	(CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS *			\
	 CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE)
	unsigned char *mem;	/* Memory access location with size of
				 * memblocks * memblocksize */
	unsigned int memlocation; /* Offset of the current byte within *mem */
	unsigned int memblocks;	/* Number of memory blocks in *mem */
	unsigned int memblocksize; /* Size of one memory block in bytes */
	unsigned int memaccessloops; /* Number of memory accesses per random
				      * bit generation */

	/* Repetition Count Test */
	unsigned int rct_count;			/* Number of stuck values */

	/* Adaptive Proportion Test cutoff values */
	unsigned int apt_cutoff; /* Intermittent health test failure */
	unsigned int apt_cutoff_permanent; /* Permanent health test failure */
#define JENT_APT_WINDOW_SIZE	512	/* Data window size */
	/* LSB of time stamp to process */
#define JENT_APT_LSB		16
#define JENT_APT_WORD_MASK	(JENT_APT_LSB - 1)
	unsigned int apt_observations;	/* Number of collected observations */
	unsigned int apt_count;		/* APT counter */
	unsigned int apt_base;		/* APT base reference */
	unsigned int health_failure;	/* Record health failure */

	unsigned int apt_base_set:1;	/* APT base reference set? */
};

/* Flags that can be used to initialize the RNG */
#define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more
					   * entropy, saves MEMORY_SIZE RAM for
					   * entropy collector */

/* -- error codes for init function -- */
#define JENT_ENOTIME		1 /* Timer service not available */
#define JENT_ECOARSETIME	2 /* Timer too coarse for RNG */
#define JENT_ENOMONOTONIC	3 /* Timer is not monotonic increasing */
#define JENT_EVARVAR		5 /* Timer does not produce variations of
				   * variations (2nd derivative of time is
				   * zero). */
#define JENT_ESTUCK		8 /* Too many stuck results during init. */
#define JENT_EHEALTH		9 /* Health test failed during initialization */
#define JENT_ERCT	       10 /* RCT failed during initialization */
#define JENT_EHASH	       11 /* Hash self test failed */
#define JENT_EMEM	       12 /* Can't allocate memory for initialization */

/*
 * Health failure flags kept in rand_data->health_failure. The intermittent
 * flags occupy the low bits; the permanent variant of each flag is the same
 * bit shifted left by JENT_PERMANENT_FAILURE_SHIFT.
 */
#define JENT_RCT_FAILURE	1 /* Failure in RCT health test. */
#define JENT_APT_FAILURE	2 /* Failure in APT health test. */
#define JENT_PERMANENT_FAILURE_SHIFT	16
/* The argument is parenthesized so that expressions such as (a | b) work. */
#define JENT_PERMANENT_FAILURE(x)	((x) << JENT_PERMANENT_FAILURE_SHIFT)
#define JENT_RCT_FAILURE_PERMANENT	JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE)
#define JENT_APT_FAILURE_PERMANENT	JENT_PERMANENT_FAILURE(JENT_APT_FAILURE)

/*
 * The output n bits can receive more than n bits of min entropy, of course,
 * but the fixed output of the conditioning function can only asymptotically
 * approach the output size bits of min entropy, not attain that bound. Random
 * maps will tend to have output collisions, which reduces the creditable
 * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound).
 *
 * The value "64" is justified in Appendix A.4 of the current 90C draft,
 * and aligns with NIST's in "epsilon" definition in this document, which is
 * that a string can be considered "full entropy" if you can bound the min
 * entropy in each bit of output to at least 1-epsilon, where epsilon is
 * required to be <= 2^(-32).
*/ #define JENT_ENTROPY_SAFETY_FACTOR 64 #include <linux/fips.h> #include <linux/minmax.h> #include "jitterentropy.h" /*************************************************************************** * Adaptive Proportion Test * * This test complies with SP800-90B section 4.4.2. ***************************************************************************/ /* * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B * APT. * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)). * (The original formula wasn't correct because the first symbol must * necessarily have been observed, so there is no chance of observing 0 of these * symbols.) * * For the alpha < 2^-53, R cannot be used as it uses a float data type without * arbitrary precision. A SageMath script is used to calculate those cutoff * values. * * For any value above 14, this yields the maximal allowable value of 512 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that * renders the test unable to fail). */ static const unsigned int jent_apt_cutoff_lookup[15] = { 325, 422, 459, 477, 488, 494, 499, 502, 505, 507, 508, 509, 510, 511, 512 }; static const unsigned int jent_apt_cutoff_permanent_lookup[15] = { 355, 447, 479, 494, 502, 507, 510, 512, 512, 512, 512, 512, 512, 512, 512 }; #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) static void jent_apt_init(struct rand_data *ec, unsigned int osr) { /* * Establish the apt_cutoff based on the presumed entropy rate of * 1/osr. 
*/ if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) { ec->apt_cutoff = jent_apt_cutoff_lookup[ ARRAY_SIZE(jent_apt_cutoff_lookup) - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[ ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1]; } else { ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[osr - 1]; } } /* * Reset the APT counter * * @ec [in] Reference to entropy collector */ static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked) { /* Reset APT counter */ ec->apt_count = 0; ec->apt_base = delta_masked; ec->apt_observations = 0; } /* * Insert a new entropy event into APT * * @ec [in] Reference to entropy collector * @delta_masked [in] Masked time delta to process */ static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked) { /* Initialize the base reference */ if (!ec->apt_base_set) { ec->apt_base = delta_masked; ec->apt_base_set = 1; return; } if (delta_masked == ec->apt_base) { ec->apt_count++; /* Note, ec->apt_count starts with one. */ if (ec->apt_count >= ec->apt_cutoff_permanent) ec->health_failure |= JENT_APT_FAILURE_PERMANENT; else if (ec->apt_count >= ec->apt_cutoff) ec->health_failure |= JENT_APT_FAILURE; } ec->apt_observations++; if (ec->apt_observations >= JENT_APT_WINDOW_SIZE) jent_apt_reset(ec, delta_masked); } /*************************************************************************** * Stuck Test and its use as Repetition Count Test * * The Jitter RNG uses an enhanced version of the Repetition Count Test * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical * back-to-back values, the input to the RCT is the counting of the stuck * values during the generation of one Jitter RNG output block. * * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8. * * During the counting operation, the Jitter RNG always calculates the RCT * cut-off value of C. 
If that value exceeds the allowed cut-off value,
 * the Jitter RNG output block will be calculated completely but discarded at
 * the end. The caller of the Jitter RNG is informed with an error code.
 ***************************************************************************/

/*
 * Repetition Count Test as defined in SP800-90B section 4.4.1
 *
 * Counts consecutive stuck measurements in ec->rct_count and records an
 * intermittent or permanent RCT failure in ec->health_failure once the run
 * reaches 30 * osr or 60 * osr respectively. A non-stuck measurement
 * resets the counter.
 *
 * @ec [in] Reference to entropy collector
 * @stuck [in] Indicator whether the value is stuck
 */
static void jent_rct_insert(struct rand_data *ec, int stuck)
{
	if (stuck) {
		ec->rct_count++;

		/*
		 * The cutoff value is based on the following consideration:
		 * alpha = 2^-30 or 2^-60 as recommended in SP800-90B.
		 * In addition, we require an entropy value H of 1/osr as this
		 * is the minimum entropy required to provide full entropy.
		 * Note, we collect (DATA_SIZE_BITS + ENTROPY_SAFETY_FACTOR)*osr
		 * deltas for inserting them into the entropy pool which should
		 * then have (close to) DATA_SIZE_BITS bits of entropy in the
		 * conditioned output.
		 *
		 * Note, ec->rct_count (which equals to value B in the pseudo
		 * code of SP800-90B section 4.4.1) starts with zero. Hence
		 * we need to subtract one from the cutoff value as calculated
		 * following SP800-90B. Thus C = ceil(-log_2(alpha)/H) = 30*osr
		 * or 60*osr.
		 *
		 * The permanent (60 * osr) threshold is tested first because
		 * any count that reaches it also satisfies the intermittent
		 * (30 * osr) one. After a failure the counter is set to -1 so
		 * that the increment on the next stuck value brings it back
		 * to zero.
		 */
		if ((unsigned int)ec->rct_count >= (60 * ec->osr)) {
			ec->rct_count = -1;
			ec->health_failure |= JENT_RCT_FAILURE_PERMANENT;
		} else if ((unsigned int)ec->rct_count >= (30 * ec->osr)) {
			ec->rct_count = -1;
			ec->health_failure |= JENT_RCT_FAILURE;
		}
	} else {
		/* Reset RCT */
		ec->rct_count = 0;
	}
}

/*
 * Distance from one time stamp to the next, tolerating a counter wrap
 *
 * @prev [in] Earlier value
 * @next [in] Later value
 *
 * @return next - prev when prev < next. Otherwise the distance is computed
 *	   across the 64 bit wrap-around; for prev == next that expression
 *	   evaluates to 0.
 */
static inline __u64 jent_delta(__u64 prev, __u64 next)
{
#define JENT_UINT64_MAX		(__u64)(~((__u64) 0))
	return (prev < next) ? (next - prev) :
			       (JENT_UINT64_MAX - prev + 1 + next);
}

/*
 * Stuck test by checking the:
 * 	1st derivative of the jitter measurement (time delta)
 * 	2nd derivative of the jitter measurement (delta of time deltas)
 * 	3rd derivative of the jitter measurement (delta of delta of time deltas)
 *
 * All values must always be non-zero.
*
 * Besides computing the verdict, this function updates the stored deltas in
 * the entropy collector and feeds both health tests (APT and RCT).
 *
 * @ec [in] Reference to entropy collector
 * @current_delta [in] Jitter time delta
 *
 * @return
 * 	0 jitter measurement not stuck (good bit)
 * 	1 jitter measurement stuck (reject bit)
 */
static int jent_stuck(struct rand_data *ec, __u64 current_delta)
{
	/* Both derivatives are computed before the stored deltas are updated. */
	__u64 delta2 = jent_delta(ec->last_delta, current_delta);
	__u64 delta3 = jent_delta(ec->last_delta2, delta2);

	ec->last_delta = current_delta;
	ec->last_delta2 = delta2;

	/*
	 * Insert the result of the comparison of two back-to-back time
	 * deltas.
	 */
	jent_apt_insert(ec, current_delta);

	if (!current_delta || !delta2 || !delta3) {
		/* RCT with a stuck bit */
		jent_rct_insert(ec, 1);
		return 1;
	}

	/* RCT with a non-stuck bit */
	jent_rct_insert(ec, 0);

	return 0;
}

/*
 * Report any health test failures
 *
 * Outside of FIPS mode the recorded failures are not reported: the function
 * returns 0 regardless of ec->health_failure.
 *
 * @ec [in] Reference to entropy collector
 *
 * @return a bitmask indicating which tests failed
 *	0 No health test failure
 *	1 RCT failure
 *	2 APT failure
 *	1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure
 *	2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure
 */
static unsigned int jent_health_failure(struct rand_data *ec)
{
	/* Test is only enabled in FIPS mode */
	if (!fips_enabled)
		return 0;

	return ec->health_failure;
}

/***************************************************************************
 * Noise sources
 ***************************************************************************/

/*
 * Update of the loop count used for the next round of
 * an entropy collection.
 *
 * The current time stamp is cut into chunks of @bits bits which are XORed
 * together; 1 << @min is then added to the folded value.
 *
 * Input:
 * @bits is the number of low bits of the timer to consider per chunk; it
 *	must not be 0 because it is used as a divisor
 * @min is the exponent of the lower boundary: 1 << min is added to the
 *	folded value so that the result is never smaller than that
 *
 * @return Newly calculated loop counter
 */
static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min)
{
	__u64 time = 0;
	__u64 shuffle = 0;
	unsigned int i = 0;
	unsigned int mask = (1<<bits) - 1;

	jent_get_nstime(&time);

	/*
	 * We fold the time value as much as possible to ensure that as many
	 * bits of the time stamp are included as possible.
	 */
	for (i = 0; ((DATA_SIZE_BITS + bits - 1) / bits) > i; i++) {
		shuffle ^= time & mask;
		time = time >> bits;
	}

	/*
	 * We add a lower boundary value to ensure we have a minimum
	 * RNG loop count.
	 */
	return (shuffle + (1<<min));
}

/*
 * CPU Jitter noise source -- this is the noise source based on the CPU
 * execution time jitter
 *
 * This function injects the individual bits of the time value into the
 * entropy pool using a hash. A snapshot of the current health test counters
 * is handed to jent_hash_time() as additional data.
 *
 * ec [in] entropy collector
 * time [in] time stamp to be injected
 * stuck [in] Is the time stamp identified as stuck?
 *
 * Output:
 * updated hash context in the entropy collector or error code (the return
 * value of jent_hash_time() is passed through unchanged)
 */
static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck)
{
#define SHA3_HASH_LOOP (1<<3)
	struct {
		int rct_count;
		unsigned int apt_observations;
		unsigned int apt_count;
		unsigned int apt_base;
	} addtl = {
		ec->rct_count,
		ec->apt_observations,
		ec->apt_count,
		ec->apt_base
	};

	return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl),
			      SHA3_HASH_LOOP, stuck);
}

/*
 * Memory Access noise source -- this is a noise source based on variations in
 * memory access times
 *
 * This function performs memory accesses which will add to the timing
 * variations due to an unknown amount of CPU wait states that need to be
 * added when accessing memory. The memory size should be larger than the L1
 * caches as outlined in the documentation and the associated testing.
 *
 * The L1 cache has a very high bandwidth, albeit its access rate is usually
 * slower than accessing CPU registers. Therefore, L1 accesses only add minimal
 * variations as the CPU has hardly to wait. Starting with L2, significant
 * variations are added because L2 typically does not belong to the CPU any more
 * and therefore a wider range of CPU wait states is necessary for accesses.
 * L3 and real memory accesses have even a wider range of wait states. However,
 * to reliably access either L3 or memory, the ec->mem memory must be quite
 * large which is usually not desirable.
*
 * @ec [in] Reference to the entropy collector with the memory access data -- if
 *	    the reference to the memory block to be accessed is NULL, this noise
 *	    source is disabled
 * @loop_cnt [in] if a value not equal to 0 is set, it replaces the
 *		  time-derived part of the loop count; ec->memaccessloops
 *		  iterations are performed in addition in either case
 */
static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt)
{
	unsigned int wrap = 0;
	__u64 i = 0;
#define MAX_ACC_LOOP_BIT 7
#define MIN_ACC_LOOP_BIT 0
	/*
	 * With 7 bits folded and a minimum exponent of 0 the shuffled count
	 * lies between 1 and 128. It is obtained before the NULL checks
	 * below, i.e. the time stamp is read even if the source is disabled.
	 */
	__u64 acc_loop_cnt =
		jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT);

	if (NULL == ec || NULL == ec->mem)
		return;
	/*
	 * Total size of the memory area in bytes.
	 * NOTE(review): wrap is used as a modulus below and is not checked
	 * for zero here; it relies on non-zero memblocksize and memblocks.
	 */
	wrap = ec->memblocksize * ec->memblocks;

	/*
	 * testing purposes -- allow test app to set the counter, not
	 * needed during runtime
	 */
	if (loop_cnt)
		acc_loop_cnt = loop_cnt;

	for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) {
		unsigned char *tmpval = ec->mem + ec->memlocation;
		/*
		 * memory access: just add 1 to one byte,
		 * wrap at 255 -- memory access implies read
		 * from and write to memory location
		 */
		*tmpval = (*tmpval + 1) & 0xff;
		/*
		 * Addition of memblocksize - 1 to pointer
		 * with wrap around logic to ensure that every
		 * memory location is hit evenly
		 */
		ec->memlocation = ec->memlocation + ec->memblocksize - 1;
		ec->memlocation = ec->memlocation % wrap;
	}
}

/***************************************************************************
 * Start of entropy processing logic
 ***************************************************************************/
/*
 * This is the heart of the entropy generation: calculate time deltas and
 * use the CPU jitter in the time deltas. The jitter is injected into the
 * entropy pool.
 *
 * WARNING: ensure that ->prev_time is primed before using the output
 * 	    of this function! This can be done by calling this function
 * 	    and not using its result.
*
 * @ec [in] Reference to entropy collector
 * @ret_current_delta [out] If not NULL, receives the raw time delta of this
 *			    measurement
 *
 * @return result of stuck test; 1 is also returned when conditioning of the
 *	   data reported an error
 */
static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta)
{
	__u64 time = 0;
	__u64 current_delta = 0;
	int stuck;

	/* Invoke one noise source before time measurement to add variations */
	jent_memaccess(ec, 0);

	/*
	 * Get time stamp and calculate time delta to previous
	 * invocation to measure the timing variations
	 */
	jent_get_nstime(&time);
	current_delta = jent_delta(ec->prev_time, time);
	ec->prev_time = time;

	/* Check whether we have a stuck measurement. */
	stuck = jent_stuck(ec, current_delta);

	/* Now call the next noise sources which also injects the data */
	if (jent_condition_data(ec, current_delta, stuck))
		stuck = 1;

	/* return the raw entropy value */
	if (ret_current_delta)
		*ret_current_delta = current_delta;

	return stuck;
}

/*
 * Collect the measurements for one output block
 * Function fills rand_data->hash_state
 *
 * After one priming measurement, measurements are repeated until
 * (DATA_SIZE_BITS + safety_factor) * osr non-stuck values have been
 * processed, where safety_factor is JENT_ENTROPY_SAFETY_FACTOR in FIPS mode
 * and 0 otherwise. The loop also ends as soon as a health failure is
 * reported; the caller has to check for that condition.
 *
 * @ec [in] Reference to entropy collector
 */
static void jent_gen_entropy(struct rand_data *ec)
{
	unsigned int k = 0, safety_factor = 0;

	if (fips_enabled)
		safety_factor = JENT_ENTROPY_SAFETY_FACTOR;

	/* priming of the ->prev_time value */
	jent_measure_jitter(ec, NULL);

	while (!jent_health_failure(ec)) {
		/* If a stuck measurement is received, repeat measurement */
		if (jent_measure_jitter(ec, NULL))
			continue;

		/*
		 * We multiply the loop value with ->osr to obtain the
		 * oversampling rate requested by the caller
		 */
		if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr))
			break;
	}
}

/*
 * Entry function: Obtain entropy for the caller.
 *
 * This function invokes the entropy gathering logic as often to generate
 * as many bytes as requested by the caller. Each invocation of the entropy
 * gathering logic yields at most DATA_SIZE_BITS / 8 bytes of output.
 *
 * This function truncates the last output block to the exact
 * size specified by the caller.
* * @ec [in] Reference to entropy collector * @data [in] pointer to buffer for storing random data -- buffer must already * exist * @len [in] size of the buffer, specifying also the requested number of random * in bytes * * @return 0 when request is fulfilled or an error * * The following error codes can occur: * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len) { unsigned char *p = data; if (!ec) return -1; while (len > 0) { unsigned int tocopy, health_test_result; jent_gen_entropy(ec); health_test_result = jent_health_failure(ec); if (health_test_result > JENT_PERMANENT_FAILURE_SHIFT) { /* * At this point, the Jitter RNG instance is considered * as a failed instance. There is no rerun of the * startup test any more, because the caller * is assumed to not further use this instance. */ return -3; } else if (health_test_result) { /* * Perform startup health tests and return permanent * error if it fails. 
*/ if (jent_entropy_init(0, 0, NULL, ec)) { /* Mark the permanent error */ ec->health_failure &= JENT_RCT_FAILURE_PERMANENT | JENT_APT_FAILURE_PERMANENT; return -3; } return -2; } tocopy = min(DATA_SIZE_BITS / 8, len); if (jent_read_random_block(ec->hash_state, p, tocopy)) return -1; len -= tocopy; p += tocopy; } return 0; } /*************************************************************************** * Initialization logic ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, unsigned int flags, void *hash_state) { struct rand_data *entropy_collector; entropy_collector = jent_zalloc(sizeof(struct rand_data)); if (!entropy_collector) return NULL; if (!(flags & JENT_DISABLE_MEMORY_ACCESS)) { /* Allocate memory for adding variations based on memory * access */ entropy_collector->mem = jent_kvzalloc(JENT_MEMORY_SIZE); if (!entropy_collector->mem) { jent_zfree(entropy_collector); return NULL; } entropy_collector->memblocksize = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE; entropy_collector->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS; entropy_collector->memaccessloops = JENT_MEMORY_ACCESSLOOPS; } /* verify and set the oversampling rate */ if (osr == 0) osr = 1; /* H_submitter = 1 / osr */ entropy_collector->osr = osr; entropy_collector->flags = flags; entropy_collector->hash_state = hash_state; /* Initialize the APT */ jent_apt_init(entropy_collector, osr); /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); return entropy_collector; } void jent_entropy_collector_free(struct rand_data *entropy_collector) { jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE); entropy_collector->mem = NULL; jent_zfree(entropy_collector); } int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state, struct rand_data *p_ec) { /* * If caller provides an allocated ec, reuse it which implies that the * health test entropy data is used to further 
still the available * entropy pool. */ struct rand_data *ec = p_ec; int i, time_backwards = 0, ret = 0, ec_free = 0; unsigned int health_test_result; if (!ec) { ec = jent_entropy_collector_alloc(osr, flags, hash_state); if (!ec) return JENT_EMEM; ec_free = 1; } else { /* Reset the APT */ jent_apt_reset(ec, 0); /* Ensure that a new APT base is obtained */ ec->apt_base_set = 0; /* Reset the RCT */ ec->rct_count = 0; /* Reset intermittent, leave permanent health test result */ ec->health_failure &= (~JENT_RCT_FAILURE); ec->health_failure &= (~JENT_APT_FAILURE); } /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These * loop counts may show some slight skew and we produce * false positives. * * Moreover, only old systems show potentially problematic * jitter entropy that could potentially be caught here. But * the RNG is intended for hardware that is available or widely * used, but not old systems that are long out of favor. Thus, * no statistical tests. */ /* * We could add a check for system capabilities such as clock_getres or * check for CONFIG_X86_TSC, but it does not make much sense as the * following sanity checks verify that we have a high-resolution * timer. */ /* * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is * definitely too little. * * SP800-90B requires at least 1024 initial test cycles. 
*/ #define TESTLOOPCOUNT 1024 #define CLEARCACHE 100 for (i = 0; (TESTLOOPCOUNT + CLEARCACHE) > i; i++) { __u64 start_time = 0, end_time = 0, delta = 0; /* Invoke core entropy collection logic */ jent_measure_jitter(ec, &delta); end_time = ec->prev_time; start_time = ec->prev_time - delta; /* test whether timer works */ if (!start_time || !end_time) { ret = JENT_ENOTIME; goto out; } /* * test whether timer is fine grained enough to provide * delta even when called shortly after each other -- this * implies that we also have a high resolution timer */ if (!delta || (end_time == start_time)) { ret = JENT_ECOARSETIME; goto out; } /* * up to here we did not modify any variable that will be * evaluated later, but we already performed some work. Thus we * already have had an impact on the caches, branch prediction, * etc. with the goal to clear it to get the worst case * measurements. */ if (i < CLEARCACHE) continue; /* test whether we have an increasing timer */ if (!(end_time > start_time)) time_backwards++; } /* * we allow up to three times the time running backwards. * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus, * if such an operation just happens to interfere with our test, it * should not fail. The value of 3 should cover the NTP case being * performed during our test run. */ if (time_backwards > 3) { ret = JENT_ENOMONOTONIC; goto out; } /* Did we encounter a health test failure? */ health_test_result = jent_health_failure(ec); if (health_test_result) { ret = (health_test_result & JENT_RCT_FAILURE) ? JENT_ERCT : JENT_EHEALTH; goto out; } out: if (ec_free) jent_entropy_collector_free(ec); return ret; }
22264 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM capability

#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CAPABILITY_H

#include <linux/cred.h>
#include <linux/tracepoint.h>
#include <linux/user_namespace.h>

/**
 * cap_capable - called after it's determined if a task has a particular
 * effective capability
 *
 * @cred: The credentials used
 * @target_ns: The user namespace of the resource being accessed
 * @capable_ns: The user namespace in which the credential provides the
 *              capability to access the targeted resource.
 *              This will be NULL if ret is not 0.
 * @cap: The capability to check for
 * @ret: The return value of the check: 0 if it does, -ve if it does not
 *
 * Allows tracing of calls to cap_capable in commoncap.c
 */
TRACE_EVENT(cap_capable,

	TP_PROTO(const struct cred *cred, struct user_namespace *target_ns,
		 const struct user_namespace *capable_ns, int cap, int ret),

	TP_ARGS(cred, target_ns, capable_ns, cap, ret),

	/* One recorded field per tracepoint argument */
	TP_STRUCT__entry(
		__field(const struct cred *, cred)
		__field(struct user_namespace *, target_ns)
		__field(const struct user_namespace *, capable_ns)
		__field(int, cap)
		__field(int, ret)
	),

	TP_fast_assign(
		__entry->cred = cred;
		__entry->target_ns = target_ns;
		/* Only record capable_ns for a successful check (ret == 0) */
		__entry->capable_ns = ret == 0 ? capable_ns : NULL;
		__entry->cap = cap;
		__entry->ret = ret;
	),

	TP_printk("cred %p, target_ns %p, capable_ns %p, cap %d, ret %d",
		  __entry->cred, __entry->target_ns, __entry->capable_ns,
		  __entry->cap, __entry->ret)
);

#endif /* _TRACE_CAPABILITY_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* * include/net/tipc.h: Include file for TIPC message header routines * * Copyright (c) 2017 Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef _TIPC_HDR_H
#define _TIPC_HDR_H

#include <linux/random.h>

#define KEEPALIVE_MSG_MASK 0x0e080000  /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */

/* Four big-endian 32-bit header words, addressed as w[0]..w[3] */
struct tipc_basic_hdr {
	__be32 w[4];
};

/*
 * tipc_hdr_rps_key - pick the key for a message header
 * @hdr: header words of the message
 *
 * A message whose first word has all KEEPALIVE_MSG_MASK bits set gets a
 * freshly generated random key; every other message is keyed by w[3], which
 * the original comment describes as the source node identity.
 *
 * Return: the key, as a big-endian 32-bit value
 */
static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
{
	__be32 key;

	/* Expected case: not a keepalive, so key by the source node word */
	if (likely((ntohl(hdr->w[0]) & KEEPALIVE_MSG_MASK) !=
		   KEEPALIVE_MSG_MASK))
		return hdr->w[3];

	/* Spread PROBE/PROBE_REPLY messages across the cores */
	get_random_bytes(&key, sizeof(key));

	return key;
}

#endif
1564 215 170 1 5 4 13 3 2 113 70 1 18 18 71 26 278 111 14 3 3 2 6 1 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 
510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 
1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 
1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 
2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #ifndef _BR_PRIVATE_H #define _BR_PRIVATE_H #include <linux/netdevice.h> #include <linux/if_bridge.h> #include <linux/netpoll.h> #include <linux/u64_stats_sync.h> #include <net/route.h> #include <net/ip6_fib.h> #include <net/pkt_cls.h> #include <linux/if_vlan.h> #include <linux/rhashtable.h> #include <linux/refcount.h> #define BR_HASH_BITS 8 #define BR_HASH_SIZE (1 << BR_HASH_BITS) #define BR_HOLD_TIME (1*HZ) #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1<<BR_PORT_BITS) #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000) #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN #define BR_HWDOM_MAX BITS_PER_LONG #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ enum { BR_GROUPFWD_STP = BIT(0), BR_GROUPFWD_MACPAUSE = BIT(1), BR_GROUPFWD_LACP = BIT(2), }; #define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u /* Path to usermode spanning tree program */ #define BR_STP_PROG "/sbin/bridge-stp" #define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT) typedef struct bridge_id bridge_id; typedef 
struct mac_addr mac_addr; typedef __u16 port_id; struct bridge_id { unsigned char prio[2]; unsigned char addr[ETH_ALEN]; }; struct mac_addr { unsigned char addr[ETH_ALEN]; }; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING /* our own querier */ struct bridge_mcast_own_query { struct timer_list timer; u32 startup_sent; }; /* other querier */ struct bridge_mcast_other_query { struct timer_list timer; struct timer_list delay_timer; }; /* selected querier */ struct bridge_mcast_querier { struct br_ip addr; int port_ifidx; seqcount_spinlock_t seq; }; /* IGMP/MLD statistics */ struct bridge_mcast_stats { struct br_mcast_stats mstats; struct u64_stats_sync syncp; }; struct br_mdb_src_entry { struct br_ip addr; }; struct br_mdb_config { struct net_bridge *br; struct net_bridge_port *p; struct br_mdb_entry *entry; struct br_ip group; bool src_entry; u8 filter_mode; u16 nlflags; struct br_mdb_src_entry *src_entries; int num_src_entries; u8 rt_protocol; }; #endif /* net_bridge_mcast_port must be always defined due to forwarding stubs */ struct net_bridge_mcast_port { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct net_bridge_port *port; struct net_bridge_vlan *vlan; struct bridge_mcast_own_query ip4_own_query; struct timer_list ip4_mc_router_timer; struct hlist_node ip4_rlist; #if IS_ENABLED(CONFIG_IPV6) struct bridge_mcast_own_query ip6_own_query; struct timer_list ip6_mc_router_timer; struct hlist_node ip6_rlist; #endif /* IS_ENABLED(CONFIG_IPV6) */ unsigned char multicast_router; u32 mdb_n_entries; u32 mdb_max_entries; #endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ }; /* net_bridge_mcast must be always defined due to forwarding stubs */ struct net_bridge_mcast { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct net_bridge *br; struct net_bridge_vlan *vlan; u32 multicast_last_member_count; u32 multicast_startup_query_count; u8 multicast_querier; u8 multicast_igmp_version; u8 multicast_router; #if IS_ENABLED(CONFIG_IPV6) u8 multicast_mld_version; #endif unsigned long multicast_last_member_interval; unsigned 
long multicast_membership_interval; unsigned long multicast_querier_interval; unsigned long multicast_query_interval; unsigned long multicast_query_response_interval; unsigned long multicast_startup_query_interval; struct hlist_head ip4_mc_router_list; struct timer_list ip4_mc_router_timer; struct bridge_mcast_other_query ip4_other_query; struct bridge_mcast_own_query ip4_own_query; struct bridge_mcast_querier ip4_querier; #if IS_ENABLED(CONFIG_IPV6) struct hlist_head ip6_mc_router_list; struct timer_list ip6_mc_router_timer; struct bridge_mcast_other_query ip6_other_query; struct bridge_mcast_own_query ip6_own_query; struct bridge_mcast_querier ip6_querier; #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ }; struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst __rcu *tunnel_dst; }; /* private vlan flags */ enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), BR_VLFLAG_MCAST_ENABLED = BIT(2), BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3), BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4), }; /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @tnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags * @priv_flags: private (in-kernel) bridge vlan flags * @state: STP state (e.g. 
blocking, learning, forwarding) * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct * @refcnt: if MASTER flag set, this is bumped for each port referencing it * @brvlan: if MASTER flag unset, this points to the global per-VLAN context * for this VLAN entry * @tinfo: bridge tunnel info * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast * context * @msti: if MASTER flag set, this holds the VLANs MST instance * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * * This structure is shared between the global per-VLAN entries contained in * the bridge rhashtable and the local per-port per-VLAN entries contained in * the port's rhashtable. The union entries should be interpreted depending on * the entry flags that are set. */ struct net_bridge_vlan { struct rhash_head vnode; struct rhash_head tnode; u16 vid; u16 flags; u16 priv_flags; u8 state; struct pcpu_sw_netstats __percpu *stats; union { struct net_bridge *br; struct net_bridge_port *port; }; union { refcount_t refcnt; struct net_bridge_vlan *brvlan; }; struct br_tunnel_info tinfo; union { struct net_bridge_mcast br_mcast_ctx; struct net_bridge_mcast_port port_mcast_ctx; }; u16 msti; struct list_head vlist; struct rcu_head rcu; }; /** * struct net_bridge_vlan_group * * @vlan_hash: VLAN entry rhashtable * @vlan_list: sorted VLAN entry list * @num_vlans: number of total VLAN entries * @pvid: PVID VLAN id * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking) * * IMPORTANT: Be careful when checking if there're VLAN entries using list * primitives because the bridge can have entries in its list which * are just for global context but not for filtering, i.e. they have * the master flag set but not the brentry flag. 
If you have to check * if there're "real" entries in the bridge please test @num_vlans */ struct net_bridge_vlan_group { struct rhashtable vlan_hash; struct rhashtable tunnel_hash; struct list_head vlan_list; u16 num_vlans; u16 pvid; u8 pvid_state; }; /* bridge fdb flags */ enum { BR_FDB_LOCAL, BR_FDB_STATIC, BR_FDB_STICKY, BR_FDB_ADDED_BY_USER, BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, BR_FDB_NOTIFY, BR_FDB_NOTIFY_INACTIVE, BR_FDB_LOCKED, BR_FDB_DYNAMIC_LEARNED, }; struct net_bridge_fdb_key { mac_addr addr; u16 vlan_id; }; struct net_bridge_fdb_entry { struct rhash_head rhnode; struct net_bridge_port *dst; struct net_bridge_fdb_key key; struct hlist_node fdb_node; unsigned long flags; /* write-heavy members should not affect lookups */ unsigned long updated ____cacheline_aligned_in_smp; unsigned long used; struct rcu_head rcu; }; struct net_bridge_fdb_flush_desc { unsigned long flags; unsigned long flags_mask; int port_ifindex; u16 vlan_id; }; #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) #define MDB_PG_FLAGS_STAR_EXCL BIT(3) #define MDB_PG_FLAGS_BLOCKED BIT(4) #define PG_SRC_ENT_LIMIT 32 #define BR_SGRP_F_DELETE BIT(0) #define BR_SGRP_F_SEND BIT(1) #define BR_SGRP_F_INSTALLED BIT(2) #define BR_SGRP_F_USER_ADDED BIT(3) struct net_bridge_mcast_gc { struct hlist_node gc_node; void (*destroy)(struct net_bridge_mcast_gc *gc); }; struct net_bridge_group_src { struct hlist_node node; struct br_ip addr; struct net_bridge_port_group *pg; u8 flags; u8 src_query_rexmit_cnt; struct timer_list timer; struct net_bridge *br; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; struct net_bridge_port_group_sg_key { struct net_bridge_port *port; struct br_ip addr; }; struct net_bridge_port_group { struct net_bridge_port_group __rcu *next; struct net_bridge_port_group_sg_key key; unsigned char eth_addr[ETH_ALEN] __aligned(2); unsigned char flags; unsigned char filter_mode; unsigned char 
grp_query_rexmit_cnt; unsigned char rt_protocol; struct hlist_head src_list; unsigned int src_ents; struct timer_list timer; struct timer_list rexmit_timer; struct hlist_node mglist; struct rb_root eht_set_tree; struct rb_root eht_host_tree; struct rhash_head rhnode; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; struct net_bridge_mdb_entry { struct rhash_head rhnode; struct net_bridge *br; struct net_bridge_port_group __rcu *ports; struct br_ip addr; bool host_joined; struct timer_list timer; struct hlist_node mdb_node; struct net_bridge_mcast_gc mcast_gc; struct rcu_head rcu; }; struct net_bridge_port { struct net_bridge *br; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head list; unsigned long flags; #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif struct net_bridge_port __rcu *backup_port; u32 backup_nhid; /* STP */ u8 priority; u8 state; u16 port_no; unsigned char topology_change_ack; unsigned char config_pending; port_id port_id; port_id designated_port; bridge_id designated_root; bridge_id designated_bridge; u32 path_cost; u32 designated_cost; unsigned long designated_age; struct timer_list forward_delay_timer; struct timer_list hold_timer; struct timer_list message_age_timer; struct kobject kobj; struct rcu_head rcu; struct net_bridge_mcast_port multicast_ctx; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_stats __percpu *mcast_stats; u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; struct hlist_head mglist; #endif #ifdef CONFIG_SYSFS char sysfs_name[IFNAMSIZ]; #endif #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif #ifdef CONFIG_NET_SWITCHDEV /* Identifier used to group ports that share the same switchdev * hardware domain. 
*/ int hwdom; int offload_count; struct netdev_phys_item_id ppid; #endif u16 group_fwd_mask; u16 backup_redirected_cnt; struct bridge_stp_xstats stp_xstats; }; #define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj) #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) #define br_promisc_port(p) ((p)->flags & BR_PROMISC) static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev) { return netif_is_bridge_port(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_device *dev) { return netif_is_bridge_port(dev) ? rcu_dereference_rtnl(dev->rx_handler_data) : NULL; } enum net_bridge_opts { BROPT_VLAN_ENABLED, BROPT_VLAN_STATS_ENABLED, BROPT_NF_CALL_IPTABLES, BROPT_NF_CALL_IP6TABLES, BROPT_NF_CALL_ARPTABLES, BROPT_GROUP_ADDR_SET, BROPT_MULTICAST_ENABLED, BROPT_MULTICAST_QUERY_USE_IFADDR, BROPT_MULTICAST_STATS_ENABLED, BROPT_HAS_IPV6_ADDR, BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, }; struct net_bridge { spinlock_t lock; spinlock_t hash_lock; struct hlist_head frame_type_list; struct net_device *dev; unsigned long options; /* These fields are accessed on each packet */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING __be16 vlan_proto; u16 default_pvid; struct net_bridge_vlan_group __rcu *vlgrp; #endif struct rhashtable fdb_hash_tbl; struct list_head port_list; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) union { struct rtable fake_rtable; struct rt6_info fake_rt6_info; }; #endif u16 group_fwd_mask; u16 group_fwd_mask_required; /* STP */ bridge_id designated_root; bridge_id bridge_id; unsigned char topology_change; unsigned char topology_change_detected; u16 root_port; unsigned long max_age; unsigned 
long hello_time; unsigned long forward_delay; unsigned long ageing_time; unsigned long bridge_max_age; unsigned long bridge_hello_time; unsigned long bridge_forward_delay; unsigned long bridge_ageing_time; u32 root_path_cost; u8 group_addr[ETH_ALEN]; enum { BR_NO_STP, /* no spanning tree */ BR_KERNEL_STP, /* old STP in kernel */ BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; struct net_bridge_mcast multicast_ctx; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_stats __percpu *mcast_stats; u32 hash_max; spinlock_t multicast_lock; struct rhashtable mdb_hash_tbl; struct rhashtable sg_port_tbl; struct hlist_head mcast_gc_list; struct hlist_head mdb_list; struct work_struct mcast_gc_work; #endif struct timer_list hello_timer; struct timer_list tcn_timer; struct timer_list topology_change_timer; struct delayed_work gc_work; struct kobject *ifobj; u32 auto_cnt; atomic_t fdb_n_learned; u32 fdb_max_learned; #ifdef CONFIG_NET_SWITCHDEV /* Counter used to make sure that hardware domains get unique * identifiers in case a bridge spans multiple switchdev instances. */ int last_hwdom; /* Bit mask of hardware domain numbers in use */ unsigned long busy_hwdoms; #endif struct hlist_head fdb_list; #if IS_ENABLED(CONFIG_BRIDGE_MRP) struct hlist_head mrp_list; #endif #if IS_ENABLED(CONFIG_BRIDGE_CFM) struct hlist_head mep_list; #endif }; struct br_input_skb_cb { struct net_device *brdev; u16 frag_max_size; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u8 igmp; u8 mrouters_only:1; #endif u8 proxyarp_replied:1; u8 src_port_isolated:1; u8 promisc:1; #ifdef CONFIG_BRIDGE_VLAN_FILTERING u8 vlan_filtered:1; #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE u8 br_netfilter_broute:1; #endif #ifdef CONFIG_NET_SWITCHDEV /* Set if TX data plane offloading is used towards at least one * hardware domain. */ u8 tx_fwd_offload:1; /* The switchdev hardware domain from which this packet was received. 
* If skb->offload_fwd_mark was set, then this packet was already * forwarded by hardware to the other ports in the source hardware * domain, otherwise it wasn't. */ int src_hwdom; /* Bit mask of hardware domains towards this packet has already been * transmitted using the TX data plane offload. */ unsigned long fwd_hwdoms; #endif u32 backup_nhid; }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (BR_INPUT_SKB_CB(__skb)->mrouters_only) #else # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (0) #endif #define br_printk(level, br, format, args...) \ printk(level "%s: " format, (br)->dev->name, ##args) #define br_err(__br, format, args...) \ br_printk(KERN_ERR, __br, format, ##args) #define br_warn(__br, format, args...) \ br_printk(KERN_WARNING, __br, format, ##args) #define br_notice(__br, format, args...) \ br_printk(KERN_NOTICE, __br, format, ##args) #define br_info(__br, format, args...) \ br_printk(KERN_INFO, __br, format, ##args) #define br_debug(br, format, args...) 
\ pr_debug("%s: " format, (br)->dev->name, ##args) /* called under bridge lock */ static inline int br_is_root_bridge(const struct net_bridge *br) { return !memcmp(&br->bridge_id, &br->designated_root, 8); } /* check if a VLAN entry is global */ static inline bool br_vlan_is_master(const struct net_bridge_vlan *v) { return v->flags & BRIDGE_VLAN_INFO_MASTER; } /* check if a VLAN entry is used by the bridge */ static inline bool br_vlan_is_brentry(const struct net_bridge_vlan *v) { return v->flags & BRIDGE_VLAN_INFO_BRENTRY; } /* check if we should use the vlan entry, returns false if it's only context */ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v) { if (br_vlan_is_master(v)) { if (br_vlan_is_brentry(v)) return true; else return false; } return true; } static inline bool nbp_state_should_learn(const struct net_bridge_port *p) { return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING; } static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack) { bool ret = vid > 0 && vid < VLAN_VID_MASK; if (!ret) NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid"); return ret; } static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, const struct bridge_vlan_info *last, struct netlink_ext_ack *extack) { /* pvid flag is not allowed in ranges */ if (cur->flags & BRIDGE_VLAN_INFO_PVID) { NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range"); return false; } /* when cur is the range end, check if: * - it has range start flag * - range ids are invalid (end is equal to or before start) */ if (last) { if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one"); return false; } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) { NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing"); return false; } else if (cur->vid <= last->vid) { NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); return false; } } 
/* check for required range flags */ if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | BRIDGE_VLAN_INFO_RANGE_END))) { NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing"); return false; } return true; } static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v) { u8 mcast_router = MDB_RTR_TYPE_DISABLED; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (!br_vlan_is_master(v)) mcast_router = v->port_mcast_ctx.multicast_router; else mcast_router = v->br_mcast_ctx.multicast_router; #endif return mcast_router; } static inline int br_afspec_cmd_to_rtm(int cmd) { switch (cmd) { case RTM_SETLINK: return RTM_NEWVLAN; case RTM_DELLINK: return RTM_DELVLAN; } return 0; } static inline int br_opt_get(const struct net_bridge *br, enum net_bridge_opts opt) { return test_bit(opt, &br->options); } int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack); int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt); int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack); void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm); void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss) { struct tc_skb_ext *ext; if (!tc_skb_ext_tc_enabled()) return; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) { ext->l2_miss = miss; return; } if (!miss) return; ext = tc_skb_ext_alloc(skb); if (!ext) return; ext->l2_miss = true; } #else static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss) { } #endif /* br_device.c */ void br_dev_setup(struct net_device *dev); void br_dev_delete(struct net_device *dev, struct list_head *list); netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); #ifdef CONFIG_NET_POLL_CONTROLLER static inline void br_netpoll_send_skb(const struct net_bridge_port 
*p, struct sk_buff *skb) { netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); void br_netpoll_disable(struct net_bridge_port *p); #else static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline int br_netpoll_enable(struct net_bridge_port *p) { return 0; } static inline void br_netpoll_disable(struct net_bridge_port *p) { } #endif /* br_fdb.c */ #define FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) #define FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) #define FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_USE | NTF_EXT_LEARNED | \ NTF_STICKY | NTF_OFFLOADED) int br_fdb_init(void); void br_fdb_fini(void); int br_fdb_hash_init(struct net_bridge *br); void br_fdb_hash_fini(struct net_bridge *br); void br_fdb_flush(struct net_bridge *br, const struct net_bridge_fdb_flush_desc *desc); void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid); void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid); int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, unsigned long off); int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack); int 
br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, bool *notified, struct netlink_ext_ack *extack); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool locked, bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify); void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded); /* br_forward.c */ enum br_pkt_type { BR_PKT_UNICAST, BR_PKT_MULTICAST, BR_PKT_BROADCAST }; int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig, u16 vid); /* return true if both source port and dest port are isolated */ static inline bool br_skb_isolated(const struct net_bridge_port *to, const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->src_port_isolated && (to->flags & BR_ISOLATED); } /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified); int br_add_bridge(struct net *net, const char 
*name); int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); void br_mtu_auto_adjust(struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); void br_manage_promisc(struct net_bridge *br); int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); struct br_frame_type { __be16 type; int (*frame_handler)(struct net_bridge_port *port, struct sk_buff *skb); struct hlist_node list; }; void br_add_frame(struct net_bridge *br, struct br_frame_type *ft); void br_del_frame(struct net_bridge *br, struct br_frame_type *ft); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); } static inline bool br_rx_handler_check_rtnl(const struct net_device *dev) { return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev); } static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) { return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL; } static inline struct net_bridge_port * br_port_get_check_rtnl(const struct net_device *dev) { return br_rx_handler_check_rtnl(dev) ? 
br_port_get_rtnl_rcu(dev) : NULL; } /* br_ioctl.c */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd); int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid); struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid); int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); void br_multicast_enable_port(struct net_bridge_port *port); void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); void br_multicast_join_snoopers(struct net_bridge *br); void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig); int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, unsigned long val); int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, unsigned long val); #if IS_ENABLED(CONFIG_IPV6) int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, unsigned long val); #endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge *br, struct br_ip 
*dst); struct net_bridge_mdb_entry * br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, u8 filter_mode, u8 rt_protocol, struct netlink_ext_ack *extack); void br_multicast_del_port_group(struct net_bridge_port_group *p); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp); void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); void br_multicast_uninit_stats(struct net_bridge *br); void br_multicast_get_stats(const struct net_bridge *br, const struct net_bridge_port *p, struct br_mcast_stats *dest); u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx); void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max); u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx); int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); void 
br_multicast_host_join(const struct net_bridge_mcast *brmctx, struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode); void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip); void __br_multicast_del_group_src(struct net_bridge_group_src *src); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct net_bridge_mcast *brmctx); void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx); void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); int br_rports_fill_info(struct sk_buff *skb, const struct net_bridge_mcast *brmctx); int br_multicast_dump_querier_state(struct sk_buff *skb, const struct net_bridge_mcast *brmctx, int nest_attr); size_t br_multicast_querier_state_size(void); size_t br_rports_size(const struct net_bridge_mcast *brmctx); void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val); void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, unsigned long val); static inline bool br_group_is_l2(const struct br_ip *group) { return group->proto == 0; } #define 
mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) static inline struct hlist_node * br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list)); #endif return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list)); } static inline struct net_bridge_port * br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { struct net_bridge_mcast_port *mctx; #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, ip6_rlist); else #endif mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, ip4_rlist); if (mctx) return mctx->port; else return NULL; } static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx) { return timer_pending(&brmctx->ip4_mc_router_timer); } static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) return timer_pending(&brmctx->ip6_mc_router_timer); #else return false; #endif } static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { switch (brmctx->multicast_router) { case MDB_RTR_TYPE_PERM: return true; case MDB_RTR_TYPE_TEMP_QUERY: if (skb) { if (skb->protocol == htons(ETH_P_IP)) return br_ip4_multicast_is_router(brmctx); else if (skb->protocol == htons(ETH_P_IPV6)) return br_ip6_multicast_is_router(brmctx); } else { return br_ip4_multicast_is_router(brmctx) || br_ip6_multicast_is_router(brmctx); } fallthrough; default: return false; } } static inline bool __br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *querier, const bool is_ipv6) { bool own_querier_enabled; if (brmctx->multicast_querier) { if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else 
own_querier_enabled = true; } else { own_querier_enabled = false; } return !timer_pending(&querier->delay_timer) && (own_querier_enabled || timer_pending(&querier->timer)); } static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): return __br_multicast_querier_exists(brmctx, &brmctx->ip4_other_query, false); #if IS_ENABLED(CONFIG_IPV6) case (htons(ETH_P_IPV6)): return __br_multicast_querier_exists(brmctx, &brmctx->ip6_other_query, true); #endif default: return !!mdb && br_group_is_l2(&mdb->addr); } } static inline bool br_multicast_is_star_g(const struct br_ip *ip) { switch (ip->proto) { case htons(ETH_P_IP): return ipv4_is_zeronet(ip->src.ip4); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): return ipv6_addr_any(&ip->src.ip6); #endif default: return false; } } static inline bool br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx, __be16 proto) { switch (proto) { case htons(ETH_P_IP): return !!(brmctx->multicast_igmp_version == 3); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): return !!(brmctx->multicast_mld_version == 2); #endif default: return false; } } static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; } static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx) { return brmctx->multicast_last_member_interval * brmctx->multicast_last_member_count; } static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) { return brmctx->multicast_membership_interval; } static inline bool br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx) { return !!brmctx->vlan; } static inline bool br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx) { return !!pmctx->vlan; } static inline struct net_bridge_mcast * br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx) { if 
(!br_multicast_port_ctx_is_vlan(pmctx)) return &pmctx->port->br->multicast_ctx; else return &pmctx->vlan->brvlan->br_mcast_ctx; } static inline bool br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx) { return br_multicast_ctx_is_vlan(brmctx) && (!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) || !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)); } static inline bool br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx) { return br_multicast_ctx_is_vlan(brmctx) && !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); } static inline bool br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx) { return br_multicast_port_ctx_is_vlan(pmctx) && !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); } static inline bool br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx) { return pmctx->port->state == BR_STATE_DISABLED || (br_multicast_port_ctx_is_vlan(pmctx) && (br_multicast_port_ctx_vlan_disabled(pmctx) || pmctx->vlan->state == BR_STATE_DISABLED)); } static inline bool br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) { return br_multicast_port_ctx_state_disabled(pmctx) || pmctx->port->state == BR_STATE_BLOCKING || (br_multicast_port_ctx_is_vlan(pmctx) && pmctx->vlan->state == BR_STATE_BLOCKING); } static inline bool br_rports_have_mc_router(const struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) return !hlist_empty(&brmctx->ip4_mc_router_list) || !hlist_empty(&brmctx->ip6_mc_router_list); #else return !hlist_empty(&brmctx->ip4_mc_router_list); #endif } static inline bool br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, const struct net_bridge_mcast *brmctx2) { return brmctx1->multicast_igmp_version == brmctx2->multicast_igmp_version && brmctx1->multicast_last_member_count == brmctx2->multicast_last_member_count && brmctx1->multicast_startup_query_count == 
brmctx2->multicast_startup_query_count && brmctx1->multicast_last_member_interval == brmctx2->multicast_last_member_interval && brmctx1->multicast_membership_interval == brmctx2->multicast_membership_interval && brmctx1->multicast_querier_interval == brmctx2->multicast_querier_interval && brmctx1->multicast_query_interval == brmctx2->multicast_query_interval && brmctx1->multicast_query_response_interval == brmctx2->multicast_query_response_interval && brmctx1->multicast_startup_query_interval == brmctx2->multicast_startup_query_interval && brmctx1->multicast_querier == brmctx2->multicast_querier && brmctx1->multicast_router == brmctx2->multicast_router && !br_rports_have_mc_router(brmctx1) && !br_rports_have_mc_router(brmctx2) && #if IS_ENABLED(CONFIG_IPV6) brmctx1->multicast_mld_version == brmctx2->multicast_mld_version && #endif true; } static inline bool br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) { bool vlan_snooping_enabled; vlan_snooping_enabled = !!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { return 0; } static inline struct net_bridge_mdb_entry * br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { return NULL; } static inline int br_multicast_add_port(struct net_bridge_port *port) { return 0; } static inline void br_multicast_del_port(struct net_bridge_port *port) { } static inline void br_multicast_enable_port(struct net_bridge_port *port) { } static inline void br_multicast_disable_port(struct net_bridge_port *port) { } static inline void br_multicast_init(struct net_bridge *br) { } static inline void br_multicast_join_snoopers(struct net_bridge *br) { } static inline void br_multicast_leave_snoopers(struct net_bridge *br) { } 
static inline void br_multicast_open(struct net_bridge *br) { } static inline void br_multicast_stop(struct net_bridge *br) { } static inline void br_multicast_dev_del(struct net_bridge *br) { } static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { } static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { return false; } static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { return false; } static inline int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_del(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_dump(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { return 0; } static inline int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mdb_hash_init(struct net_bridge *br) { return 0; } static inline void br_mdb_hash_fini(struct net_bridge *br) { } static inline void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { } static inline int br_multicast_init_stats(struct net_bridge *br) { return 0; } static inline void br_multicast_uninit_stats(struct net_bridge *br) { } static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return 0; } static inline void br_multicast_ctx_init(struct net_bridge *br, struct net_bridge_vlan *vlan, struct net_bridge_mcast *brmctx) { } static inline void 
br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) { } static inline void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx) { } static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { } static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { } static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) { return false; } static inline bool br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, const struct net_bridge_mcast *brmctx2) { return true; } #endif /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *port, struct net_bridge_vlan_group *vg, struct sk_buff *skb); int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack); int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, struct netlink_ext_ack *extack); int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int br_vlan_set_stats(struct net_bridge *br, unsigned 
long val); int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack); int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack); int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats); void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); void br_vlan_vlan_upper_event(struct net_device *br_dev, struct net_device *vlan_dev, unsigned long event); int br_vlan_rtnl_init(void); void br_vlan_rtnl_uninit(void); void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd); bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); void br_vlan_fill_forward_path_pvid(struct net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path); int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { return rtnl_dereference(br->vlgrp); } static inline struct net_bridge_vlan_group *nbp_vlan_group( const struct net_bridge_port *p) { return rtnl_dereference(p->vlgrp); } static inline struct net_bridge_vlan_group *br_vlan_group_rcu( const struct net_bridge *br) { return rcu_dereference(br->vlgrp); } static inline struct 
net_bridge_vlan_group *nbp_vlan_group_rcu(
	const struct net_bridge_port *p)
{
	return rcu_dereference(p->vlgrp);
}

/* Since bridge now depends on 8021Q module, by the time bridge sees the
 * skb, the vlan tag will always be present if the frame was tagged.
 *
 * Stores the VLAN id read from the skb's tag in *vid and returns 0. When the
 * skb carries no tag, stores 0 in *vid and returns -EINVAL.
 */
static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
{
	int err = 0;

	if (skb_vlan_tag_present(skb)) {
		*vid = skb_vlan_tag_get_id(skb);
	} else {
		*vid = 0;
		err = -EINVAL;
	}

	return err;
}

/* Return the pvid stored in the VLAN group, or 0 when vg is NULL. */
static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
{
	if (!vg)
		return 0;

	/* Read barrier issued before loading vg->pvid.
	 * NOTE(review): presumably pairs with a write barrier on the side
	 * that updates pvid - confirm against the writer.
	 */
	smp_rmb();
	return vg->pvid;
}

/* Return v->flags, with BRIDGE_VLAN_INFO_PVID OR-ed in when v->vid equals
 * the supplied pvid.
 */
static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
{
	return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags;
}
#else
/* Stubs for builds without CONFIG_BRIDGE_VLAN_FILTERING. */

/* Always allows ingress; reports no matching VLAN by setting *vlan to NULL. */
static inline bool br_allowed_ingress(const struct net_bridge *br,
				      struct net_bridge_vlan_group *vg,
				      struct sk_buff *skb,
				      u16 *vid, u8 *state,
				      struct net_bridge_vlan **vlan)
{
	*vlan = NULL;
	return true;
}

/* Always allows egress. */
static inline bool br_allowed_egress(struct net_bridge_vlan_group *vg,
				     const struct sk_buff *skb)
{
	return true;
}

/* Always reports that learning is permitted; *vid is not written. */
static inline bool br_should_learn(struct net_bridge_port *p,
				   struct sk_buff *skb, u16 *vid)
{
	return true;
}

/* Hands the skb back unchanged. */
static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
					     const struct net_bridge_port *port,
					     struct net_bridge_vlan_group *vg,
					     struct sk_buff *skb)
{
	return skb;
}

/* Not supported: clears *changed and returns -EOPNOTSUPP. */
static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
			      bool *changed, struct netlink_ext_ack *extack)
{
	*changed = false;
	return -EOPNOTSUPP;
}

/* Not supported: returns -EOPNOTSUPP. */
static inline int br_vlan_delete(struct net_bridge *br, u16 vid)
{
	return -EOPNOTSUPP;
}

/* No-op. */
static inline void br_vlan_flush(struct net_bridge *br)
{
}

/* No-op. */
static inline void br_recalculate_fwd_mask(struct net_bridge *br)
{
}

/* Nothing to set up; reports success. */
static inline int br_vlan_init(struct net_bridge *br)
{
	return 0;
}

/* Not supported: clears *changed and returns -EOPNOTSUPP. */
static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
			       bool *changed, struct netlink_ext_ack *extack)
{
	*changed = false;
	return -EOPNOTSUPP;
}

static inline int
nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { return -EOPNOTSUPP; } static inline void nbp_vlan_flush(struct net_bridge_port *port) { } static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { return NULL; } static inline int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack) { return 0; } static inline u16 br_vlan_get_tag(const struct sk_buff *skb, u16 *tag) { return 0; } static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg) { return 0; } static inline int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask) { return 0; } static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path) { } static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path) { return 0; } static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { return NULL; } static inline struct net_bridge_vlan_group *nbp_vlan_group( const struct net_bridge_port *p) { return NULL; } static inline struct net_bridge_vlan_group *br_vlan_group_rcu( const struct net_bridge *br) { return NULL; } static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( const struct net_bridge_port *p) { return NULL; } static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats) { } static inline void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { } static inline int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { return 0; } static inline void br_vlan_vlan_upper_event(struct net_device *br_dev, struct net_device *vlan_dev, unsigned long event) { } static inline int br_vlan_rtnl_init(void) { return 0; } static inline void 
br_vlan_rtnl_uninit(void) { } static inline void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd) { } static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return true; } static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) { return 0; } #endif /* br_vlan_options.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, const struct net_bridge_port *p); size_t br_vlan_opts_nl_size(void); int br_vlan_process_options(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan *range_start, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); int br_vlan_rtm_process_global_options(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end); bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); } static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) { return READ_ONCE(vg->pvid_state); } static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg, u8 state) { WRITE_ONCE(vg->pvid_state, state); } /* learn_allow is true at ingress and false at egress */ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) { switch (state) { case BR_STATE_LEARNING: return 
learn_allow; case BR_STATE_FORWARDING: return true; default: return false; } } #endif /* br_mst.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING DECLARE_STATIC_KEY_FALSE(br_mst_used); static inline bool br_mst_is_enabled(struct net_bridge *br) { return static_branch_unlikely(&br_mst_used) && br_opt_get(br, BROPT_MST_ENABLED); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, struct netlink_ext_ack *extack); int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti); void br_mst_vlan_init_state(struct net_bridge_vlan *v); int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); size_t br_mst_info_size(const struct net_bridge_vlan_group *vg); int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg); int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack); #else static inline bool br_mst_is_enabled(struct net_bridge *br) { return false; } static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_mst_set_enabled(struct net_bridge *br, bool on, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg) { return 0; } static inline int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg) { return -EOPNOTSUPP; } static inline int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } #endif struct nf_br_ops { int (*br_dev_xmit_hook)(struct sk_buff *skb); }; extern const struct nf_br_ops __rcu *nf_br_ops; /* br_netfilter.c */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int br_nf_core_init(void); void br_nf_core_fini(void); void br_netfilter_rtable_init(struct net_bridge *); #else static inline int br_nf_core_init(void) { return 0; } static inline void br_nf_core_fini(void) {} #define 
br_netfilter_rtable_init(x) #endif /* br_stp.c */ void br_set_state(struct net_bridge_port *p, unsigned int state); struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no); void br_init_port(struct net_bridge_port *p); void br_become_designated_port(struct net_bridge_port *p); void __br_set_forward_delay(struct net_bridge *br, unsigned long t); int br_set_forward_delay(struct net_bridge *br, unsigned long x); int br_set_hello_time(struct net_bridge *br, unsigned long x); int br_set_max_age(struct net_bridge *br, unsigned long x); int __set_ageing_time(struct net_device *dev, unsigned long t); int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); int br_stp_set_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio); int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost); ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id); /* br_stp_bpdu.c */ struct stp_proto; void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev); /* br_stp_timer.c */ void br_stp_timer_init(struct net_bridge *br); void br_stp_port_timer_init(struct net_bridge_port *p); unsigned long br_timer_value(const struct timer_list *timer); /* br.c */ #if IS_ENABLED(CONFIG_ATM_LANE) extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif /* br_mrp.c */ #if IS_ENABLED(CONFIG_BRIDGE_MRP) int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, 
struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); bool br_mrp_enabled(struct net_bridge *br); void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br); #else static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_mrp_enabled(struct net_bridge *br) { return false; } static inline void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) { } static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br) { return 0; } #endif /* br_cfm.c */ #if IS_ENABLED(CONFIG_BRIDGE_CFM) int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); bool br_cfm_created(struct net_bridge *br); void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p); int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br); int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink); int br_cfm_mep_count(struct net_bridge *br, u32 *count); int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count); #else static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p, struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline bool br_cfm_created(struct net_bridge *br) { return false; } static inline void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p) { } static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br) { return -EOPNOTSUPP; } static inline int br_cfm_status_fill_info(struct sk_buff *skb, struct net_bridge *br, bool getlink) { return -EOPNOTSUPP; } static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count) { *count = 0; return -EOPNOTSUPP; } static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 
*count) { *count = 0; return -EOPNOTSUPP; } #endif /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port); void br_info_notify(int event, const struct net_bridge *br, const struct net_bridge_port *port, u32 filter); int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags, struct netlink_ext_ack *extack); int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); int br_process_vlan_info(struct net_bridge *br, struct net_bridge_port *p, int cmd, struct bridge_vlan_info *vinfo_curr, struct bridge_vlan_info **vinfo_last, bool *changed, struct netlink_ext_ack *extack); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ extern const struct sysfs_ops brport_sysfs_ops; int br_sysfs_addif(struct net_bridge_port *p); int br_sysfs_renameif(struct net_bridge_port *p); /* br_sysfs_br.c */ int br_sysfs_addbr(struct net_device *dev); void br_sysfs_delbr(struct net_device *dev); #else static inline int br_sysfs_addif(struct net_bridge_port *p) { return 0; } static inline int br_sysfs_renameif(struct net_bridge_port *p) { return 0; } static inline int br_sysfs_addbr(struct net_device *dev) { return 0; } static inline void br_sysfs_delbr(struct net_device *dev) { return; } #endif /* CONFIG_SYSFS */ /* br_switchdev.c */ #ifdef CONFIG_NET_SWITCHDEV int br_switchdev_port_offload(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack); void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); int br_switchdev_port_replay(struct net_bridge_port *p, struct net_device *dev, const void 
*ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack); bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb); void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb); void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb); bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb); int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack); void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type); void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); void br_switchdev_init(struct net_bridge *br); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { skb->offload_fwd_mark = 0; } #else static inline int br_switchdev_port_offload(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { } static inline int br_switchdev_port_replay(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static 
inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { return false; } static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) { } static inline void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { } static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { return true; } static inline int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack) { return 0; } static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) { return -EOPNOTSUPP; } static inline void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { } static inline void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { } static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { } static inline void br_switchdev_init(struct net_bridge *br) { } #endif /* CONFIG_NET_SWITCHDEV */ /* br_arp_nd_proxy.c */ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p); void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p, struct nd_msg *msg); struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m); bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid); #endif
2 2 7 7 13 2 5 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * * This file provides functions for reading the suspend image from * and writing it to a swap partition. * * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. 
Wysocki <rjw@sisk.pl> * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> */ #define pr_fmt(fmt) "PM: " fmt #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/cpumask.h> #include <linux/atomic.h> #include <linux/kthread.h> #include <linux/crc32.h> #include <linux/ktime.h> #include "power.h" #define HIBERNATE_SIG "S1SUSPEND" u32 swsusp_hardware_signature; /* * When reading an {un,}compressed image, we may restore pages in place, * in which case some architectures need these pages cleaning before they * can be executed. We don't know which pages these may be, so clean the lot. */ static bool clean_pages_on_read; static bool clean_pages_on_decompress; /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * * The swap map is created during suspend. The swap map pages are * allocated and populated one at a time, so we only need one memory * page to set up the entire structure. * * During resume we pick up all swap_map_page structures into a list. */ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) /* * Number of free pages that are not high. */ static inline unsigned long low_free_pages(void) { return nr_free_pages() - nr_free_highpages(); } /* * Number of pages required to be kept free while writing the image. Always * half of all available low pages before the writing starts. 
*/ static inline unsigned long reqd_free_pages(void) { return low_free_pages() / 2; } struct swap_map_page { sector_t entries[MAP_PAGE_ENTRIES]; sector_t next_swap; }; struct swap_map_page_list { struct swap_map_page *map; struct swap_map_page_list *next; }; /* * The swap_map_handle structure is used for handling swap in * a file-alike way */ struct swap_map_handle { struct swap_map_page *cur; struct swap_map_page_list *maps; sector_t cur_swap; sector_t first_sector; unsigned int k; unsigned long reqd_free_pages; u32 crc32; }; struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - sizeof(u32) - sizeof(u32)]; u32 hw_sig; u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; } __packed; static struct swsusp_header *swsusp_header; /* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ struct swsusp_extent { struct rb_node node; unsigned long start; unsigned long end; }; static struct rb_root swsusp_extents = RB_ROOT; static int swsusp_extents_insert(unsigned long swap_offset) { struct rb_node **new = &(swsusp_extents.rb_node); struct rb_node *parent = NULL; struct swsusp_extent *ext; /* Figure out where to put the new node */ while (*new) { ext = rb_entry(*new, struct swsusp_extent, node); parent = *new; if (swap_offset < ext->start) { /* Try to merge */ if (swap_offset == ext->start - 1) { ext->start--; return 0; } new = &((*new)->rb_left); } else if (swap_offset > ext->end) { /* Try to merge */ if (swap_offset == ext->end + 1) { ext->end++; return 0; } new = &((*new)->rb_right); } else { /* It already is in the tree */ return -EINVAL; } } /* Add the new node and rebalance the tree. 
*/ ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); if (!ext) return -ENOMEM; ext->start = swap_offset; ext->end = swap_offset; rb_link_node(&ext->node, parent, new); rb_insert_color(&ext->node, &swsusp_extents); return 0; } /* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ sector_t alloc_swapdev_block(int swap) { unsigned long offset; offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } return 0; } /* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. */ void free_all_swap_pages(int swap) { struct rb_node *node; while ((node = swsusp_extents.rb_node)) { struct swsusp_extent *ext; ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); swap_free_nr(swp_entry(swap, ext->start), ext->end - ext->start + 1); kfree(ext); } } int swsusp_swap_in_use(void) { return (swsusp_extents.rb_node != NULL); } /* * General things */ static unsigned short root_swap = 0xffff; static struct file *hib_resume_bdev_file; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; blk_status_t error; struct blk_plug plug; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); hb->error = BLK_STS_OK; blk_start_plug(&hb->plug); } static void hib_finish_batch(struct hib_bio_batch *hb) { blk_finish_plug(&hb->plug); } static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio_first_page_all(bio); if (bio->bi_status) { pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } if (bio_data_dir(bio) == WRITE) put_page(page); else if 
(clean_pages_on_read) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_status && !hb->error) hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); bio_put(bio); } static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { struct page *page = virt_to_page(addr); struct bio *bio; int error = 0; bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { pr_err("Adding page to bio failed at %llu\n", (unsigned long long)bio->bi_iter.bi_sector); bio_put(bio); return -EFAULT; } if (hb) { bio->bi_end_io = hib_end_io; bio->bi_private = hb; atomic_inc(&hb->count); submit_bio(bio); } else { error = submit_bio_wait(bio); bio_put(bio); } return error; } static int hib_wait_io(struct hib_bio_batch *hb) { /* * We are relying on the behavior of blk_plug that a thread with * a plug will flush the plug list before sleeping. 
*/ wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); } /* * Saving part */ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; if (swsusp_hardware_signature) { swsusp_header->hw_sig = swsusp_hardware_signature; flags |= SF_HW_SIG; } swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { pr_err("Swap header not found!\n"); error = -ENODEV; } return error; } /* * Hold the swsusp_header flag. This is used in software_resume() in * 'kernel/power/hibernate' to check if the image is compressed and query * for the compression algorithm support(if so). */ unsigned int swsusp_header_flags; /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) * * This is called before saving image */ static int swsusp_swap_check(void) { int res; if (swsusp_resume_device) res = swap_type_of(swsusp_resume_device, swsusp_resume_block); else res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; root_swap = res; hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(hib_resume_bdev_file)) return PTR_ERR(hib_resume_bdev_file); return 0; } /** * write_page - Write one page to given swap location. * @buf: Address we're writing. * @offset: Offset of the swap page we're writing to. 
* @hb: bio completion batch */ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { void *src; int ret; if (!offset) return -ENOSPC; if (hb) { src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); } else { WARN_ON_ONCE(1); hb = NULL; /* Go synchronous */ src = buf; } } } else { src = buf; } return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) free_page((unsigned long)handle->cur); handle->cur = NULL; } static int get_swap_writer(struct swap_map_handle *handle) { int ret; ret = swsusp_swap_check(); if (ret) { if (ret != -ENOSPC) pr_err("Cannot find swap device, try swapon -a\n"); return ret; } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); if (!handle->cur) { ret = -ENOMEM; goto err_close; } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { ret = -ENOSPC; goto err_rel; } handle->k = 0; handle->reqd_free_pages = reqd_free_pages(); handle->first_sector = handle->cur_swap; return 0; err_rel: release_swap_writer(handle); err_close: swsusp_close(); return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { int error; sector_t offset; if (!handle->cur) return -EINVAL; offset = alloc_swapdev_block(root_swap); error = write_page(buf, offset, hb); if (error) return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; error = write_page(handle->cur, handle->cur_swap, hb); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; if (hb && low_free_pages() <= 
handle->reqd_free_pages) { error = hib_wait_io(hb); if (error) goto out; /* * Recalculate the number of required free pages, to * make sure we never take more than half. */ handle->reqd_free_pages = reqd_free_pages(); } } out: return error; } static int flush_swap_writer(struct swap_map_handle *handle) { if (handle->cur && handle->cur_swap) return write_page(handle->cur, handle->cur_swap, NULL); else return -EINVAL; } static int swap_writer_finish(struct swap_map_handle *handle, unsigned int flags, int error) { if (!error) { pr_info("S"); error = mark_swapfiles(handle, flags); pr_cont("|\n"); flush_swap_writer(handle); } if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); swsusp_close(); return error; } /* * Bytes we need for compressed data in worst case. We assume(limitation) * this is the worst of all the compression algorithms. */ #define bytes_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2) /* We need to remember how much compressed data we need to read. */ #define CMP_HEADER sizeof(size_t) /* Number of pages/bytes we'll compress at one time. */ #define UNC_PAGES 32 #define UNC_SIZE (UNC_PAGES * PAGE_SIZE) /* Number of pages we need for compressed data (worst case). */ #define CMP_PAGES DIV_ROUND_UP(bytes_worst_compress(UNC_SIZE) + \ CMP_HEADER, PAGE_SIZE) #define CMP_SIZE (CMP_PAGES * PAGE_SIZE) /* Maximum number of threads for compression/decompression. */ #define CMP_THREADS 3 /* Minimum/maximum number of pages for read buffering. 
*/ #define CMP_MIN_RD_PAGES 1024 #define CMP_MAX_RD_PAGES 8192 /** * save_image - save the suspend image data */ static int save_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_write) { unsigned int m; int ret; int nr_pages; int err2; struct hib_bio_batch hb; ktime_t start; ktime_t stop; hib_init_batch(&hb); pr_info("Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) m = 1; nr_pages = 0; start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) break; ret = swap_write_page(handle, data_of(*snapshot), &hb); if (ret) break; if (!(nr_pages % m)) pr_info("Image saving progress: %3d%%\n", nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; if (!ret) pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } /* * Structure used for CRC32. */ struct crc_data { struct task_struct *thr; /* thread */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ unsigned run_threads; /* nr current threads */ wait_queue_head_t go; /* start crc update */ wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ size_t *unc_len[CMP_THREADS]; /* uncompressed lengths */ unsigned char *unc[CMP_THREADS]; /* uncompressed data */ }; /* * CRC32 update function that runs in its own thread. */ static int crc32_threadfn(void *data) { struct crc_data *d = data; unsigned i; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); for (i = 0; i < d->run_threads; i++) *d->crc32 = crc32_le(*d->crc32, d->unc[i], *d->unc_len[i]); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } /* * Structure used for data compression. 
*/ struct cmp_data { struct task_struct *thr; /* thread */ struct crypto_comp *cc; /* crypto compressor stream */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ wait_queue_head_t go; /* start compression */ wait_queue_head_t done; /* compression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; /* Indicates the image size after compression */ static atomic_t compressed_size = ATOMIC_INIT(0); /* * Compression function that runs in its own thread. */ static int compress_threadfn(void *data) { struct cmp_data *d = data; unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); cmp_len = CMP_SIZE - CMP_HEADER; d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, d->cmp + CMP_HEADER, &cmp_len); d->cmp_len = cmp_len; atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } /** * save_compressed_image - Save the suspend image data after compression. * @handle: Swap map handle to use for saving the image. * @snapshot: Image to read data from. * @nr_to_write: Number of pages to save. 
*/
static int save_compressed_image(struct swap_map_handle *handle,
				 struct snapshot_handle *snapshot,
				 unsigned int nr_to_write)
{
	unsigned int m;
	int ret = 0;
	int nr_pages;
	int err2;
	struct hib_bio_batch hb;
	ktime_t start;
	ktime_t stop;
	size_t off;
	unsigned thr, run_threads, nr_threads;
	unsigned char *page = NULL;
	struct cmp_data *data = NULL;
	struct crc_data *crc = NULL;

	hib_init_batch(&hb);

	atomic_set(&compressed_size, 0);

	/*
	 * We'll limit the number of threads for compression to limit memory
	 * footprint.
	 */
	nr_threads = num_online_cpus() - 1;
	nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);

	/* Staging page: compressed data is copied here one page at a time. */
	page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
	if (!page) {
		pr_err("Failed to allocate %s page\n", hib_comp_algo);
		ret = -ENOMEM;
		goto out_clean;
	}

	/* One zero-initialised cmp_data (with its buffers) per thread. */
	data = vzalloc(array_size(nr_threads, sizeof(*data)));
	if (!data) {
		pr_err("Failed to allocate %s data\n", hib_comp_algo);
		ret = -ENOMEM;
		goto out_clean;
	}

	crc = kzalloc(sizeof(*crc), GFP_KERNEL);
	if (!crc) {
		pr_err("Failed to allocate crc\n");
		ret = -ENOMEM;
		goto out_clean;
	}

	/*
	 * Start the compression threads.
	 */
	for (thr = 0; thr < nr_threads; thr++) {
		init_waitqueue_head(&data[thr].go);
		init_waitqueue_head(&data[thr].done);

		data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
		if (IS_ERR_OR_NULL(data[thr].cc)) {
			pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
			ret = -EFAULT;
			goto out_clean;
		}

		data[thr].thr = kthread_run(compress_threadfn,
		                            &data[thr],
		                            "image_compress/%u", thr);
		if (IS_ERR(data[thr].thr)) {
			/* Cleared so that out_clean does not try to stop it. */
			data[thr].thr = NULL;
			pr_err("Cannot start compression threads\n");
			ret = -ENOMEM;
			goto out_clean;
		}
	}

	/*
	 * Start the CRC32 thread.
	 */
	init_waitqueue_head(&crc->go);
	init_waitqueue_head(&crc->done);

	handle->crc32 = 0;
	crc->crc32 = &handle->crc32;
	/* The CRC thread reads the same uncompressed buffers the workers use. */
	for (thr = 0; thr < nr_threads; thr++) {
		crc->unc[thr] = data[thr].unc;
		crc->unc_len[thr] = &data[thr].unc_len;
	}

	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
	if (IS_ERR(crc->thr)) {
		crc->thr = NULL;
		pr_err("Cannot start CRC32 thread\n");
		ret = -ENOMEM;
		goto out_clean;
	}

	/*
	 * Adjust the number of required free pages after all allocations have
	 * been done. We don't want to run out of pages when writing.
	 */
	handle->reqd_free_pages = reqd_free_pages();

	pr_info("Using %u thread(s) for %s compression\n", nr_threads, hib_comp_algo);
	pr_info("Compressing and saving image data (%u pages)...\n",
		nr_to_write);
	/* Progress is reported every m pages (a tenth of the image, min 1). */
	m = nr_to_write / 10;
	if (!m)
		m = 1;
	nr_pages = 0;
	start = ktime_get();
	for (;;) {
		/* Fill one buffer per thread and start each thread on it. */
		for (thr = 0; thr < nr_threads; thr++) {
			for (off = 0; off < UNC_SIZE; off += PAGE_SIZE) {
				ret = snapshot_read_next(snapshot);
				if (ret < 0)
					goto out_finish;

				/* Zero means the snapshot has no more data. */
				if (!ret)
					break;

				memcpy(data[thr].unc + off,
				       data_of(*snapshot), PAGE_SIZE);

				if (!(nr_pages % m))
					pr_info("Image saving progress: %3d%%\n",
						nr_pages / m * 10);
				nr_pages++;
			}
			/* Nothing was read for this thread: stop handing out work. */
			if (!off)
				break;

			data[thr].unc_len = off;

			atomic_set_release(&data[thr].ready, 1);
			wake_up(&data[thr].go);
		}

		/* No thread received any data in this round: we are done. */
		if (!thr)
			break;

		/* Let the CRC thread process the buffers that were filled. */
		crc->run_threads = thr;
		atomic_set_release(&crc->ready, 1);
		wake_up(&crc->go);

		/* Collect the results in thread order and write them out. */
		for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
			wait_event(data[thr].done,
				atomic_read_acquire(&data[thr].stop));
			atomic_set(&data[thr].stop, 0);

			ret = data[thr].ret;

			if (ret < 0) {
				pr_err("%s compression failed\n", hib_comp_algo);
				goto out_finish;
			}

			if (unlikely(!data[thr].cmp_len ||
			             data[thr].cmp_len >
				     bytes_worst_compress(data[thr].unc_len))) {
				pr_err("Invalid %s compressed length\n", hib_comp_algo);
				ret = -1;
				goto out_finish;
			}

			/* Store the compressed length in the CMP_HEADER slot. */
			*(size_t *)data[thr].cmp = data[thr].cmp_len;

			/*
			 * Given we are writing one page at a time to disk, we
			 * copy that much from the buffer, although the last
			 * bit will likely be smaller than full page. This is
			 * OK - we saved the length of the compressed data, so
			 * any garbage at the end will be discarded when we
			 * read it.
			 */
			for (off = 0;
			     off < CMP_HEADER + data[thr].cmp_len;
			     off += PAGE_SIZE) {
				memcpy(page, data[thr].cmp + off, PAGE_SIZE);

				ret = swap_write_page(handle, page, &hb);
				if (ret)
					goto out_finish;
			}
		}

		/* The CRC must be done before the buffers are refilled. */
		wait_event(crc->done, atomic_read_acquire(&crc->stop));
		atomic_set(&crc->stop, 0);
	}

out_finish:
	/* Drain outstanding writes; keep the first error encountered. */
	err2 = hib_wait_io(&hb);
	stop = ktime_get();
	if (!ret)
		ret = err2;
	if (!ret)
		pr_info("Image saving done\n");
	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
	pr_info("Image size after compression: %d kbytes\n",
		(atomic_read(&compressed_size) / 1024));

out_clean:
	hib_finish_batch(&hb);
	if (crc) {
		if (crc->thr)
			kthread_stop(crc->thr);
		kfree(crc);
	}
	if (data) {
		for (thr = 0; thr < nr_threads; thr++) {
			if (data[thr].thr)
				kthread_stop(data[thr].thr);
			if (data[thr].cc)
				crypto_free_comp(data[thr].cc);
		}
		vfree(data);
	}
	if (page) free_page((unsigned long)page);

	return ret;
}

/**
 *	enough_swap - Make sure we have enough swap to save the image.
 *
 *	Returns TRUE or FALSE after checking the total amount of swap
 *	space available from the resume partition.
 *
 *	The requirement is @nr_pages plus PAGES_FOR_IO, and the free count
 *	must be strictly greater than that.
 */

static int enough_swap(unsigned int nr_pages)
{
	unsigned int free_swap = count_swap_pages(root_swap, 1);
	unsigned int required;

	pr_debug("Free swap pages: %u\n", free_swap);

	required = PAGES_FOR_IO + nr_pages;
	return free_swap > required;
}

/**
 *	swsusp_write - Write entire image and metadata.
 *	@flags: flags to pass to the "boot" kernel in the image header
 *
 *	It is important _NOT_ to umount filesystems at this point. We want
 *	them synced (in case something goes wrong) but we DO not want to mark
 *	filesystem clean: it is not. (And it does not matter, if we resume
 *	correctly, we'll mark system clean, anyway.)
*/ int swsusp_write(unsigned int flags) { struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; unsigned long pages; int error; pages = snapshot_get_image_size(); error = get_swap_writer(&handle); if (error) { pr_err("Cannot get swap writer\n"); return error; } if (flags & SF_NOCOMPRESS_MODE) { if (!enough_swap(pages)) { pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; } } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_read_next(&snapshot); if (error < (int)PAGE_SIZE) { if (error >= 0) error = -EFAULT; goto out_finish; } header = (struct swsusp_info *)data_of(snapshot); error = swap_write_page(&handle, header, NULL); if (!error) { error = (flags & SF_NOCOMPRESS_MODE) ? save_image(&handle, &snapshot, pages - 1) : save_compressed_image(&handle, &snapshot, pages - 1); } out_finish: error = swap_writer_finish(&handle, flags, error); return error; } /* * The following functions allow us to read data using a swap map * in a file-like way. */ static void release_swap_reader(struct swap_map_handle *handle) { struct swap_map_page_list *tmp; while (handle->maps) { if (handle->maps->map) free_page((unsigned long)handle->maps->map); tmp = handle->maps; handle->maps = handle->maps->next; kfree(tmp); } handle->cur = NULL; } static int get_swap_reader(struct swap_map_handle *handle, unsigned int *flags_p) { int error; struct swap_map_page_list *tmp, *last; sector_t offset; *flags_p = swsusp_header->flags; if (!swsusp_header->image) /* how can this happen? 
*/ return -EINVAL; handle->cur = NULL; last = handle->maps = NULL; offset = swsusp_header->image; while (offset) { tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); if (!tmp) { release_swap_reader(handle); return -ENOMEM; } if (!handle->maps) handle->maps = tmp; if (last) last->next = tmp; last = tmp; tmp->map = (struct swap_map_page *) __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; } error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; } offset = tmp->map->next_swap; } handle->k = 0; handle->cur = handle->maps->map; return 0; } static int swap_read_page(struct swap_map_handle *handle, void *buf, struct hib_bio_batch *hb) { sector_t offset; int error; struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; error = hib_submit_io(REQ_OP_READ, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { handle->k = 0; free_page((unsigned long)handle->maps->map); tmp = handle->maps; handle->maps = handle->maps->next; kfree(tmp); if (!handle->maps) release_swap_reader(handle); else handle->cur = handle->maps->map; } return error; } static int swap_reader_finish(struct swap_map_handle *handle) { release_swap_reader(handle); return 0; } /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot * (assume there are @nr_pages pages to load) */ static int load_image(struct swap_map_handle *handle, struct snapshot_handle *snapshot, unsigned int nr_to_read) { unsigned int m; int ret = 0; ktime_t start; ktime_t stop; struct hib_bio_batch hb; int err2; unsigned nr_pages; hib_init_batch(&hb); clean_pages_on_read = true; pr_info("Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; nr_pages = 0; start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) 
break; ret = swap_read_page(handle, data_of(*snapshot), &hb); if (ret) break; if (snapshot->sync_read) ret = hib_wait_io(&hb); if (ret) break; if (!(nr_pages % m)) pr_info("Image loading progress: %3d%%\n", nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; if (!ret) { pr_info("Image loading done\n"); ret = snapshot_write_finalize(snapshot); if (!ret && !snapshot_image_loaded(snapshot)) ret = -ENODATA; } swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } /* * Structure used for data decompression. */ struct dec_data { struct task_struct *thr; /* thread */ struct crypto_comp *cc; /* crypto compressor stream */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ wait_queue_head_t go; /* start decompression */ wait_queue_head_t done; /* decompression done */ size_t unc_len; /* uncompressed length */ size_t cmp_len; /* compressed length */ unsigned char unc[UNC_SIZE]; /* uncompressed buffer */ unsigned char cmp[CMP_SIZE]; /* compressed buffer */ }; /* * Decompression function that runs in its own thread. */ static int decompress_threadfn(void *data) { struct dec_data *d = data; unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || kthread_should_stop()); if (kthread_should_stop()) { d->thr = NULL; d->ret = -1; atomic_set_release(&d->stop, 1); wake_up(&d->done); break; } atomic_set(&d->ready, 0); unc_len = UNC_SIZE; d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, d->unc, &unc_len); d->unc_len = unc_len; if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, (unsigned long)d->unc + d->unc_len); atomic_set_release(&d->stop, 1); wake_up(&d->done); } return 0; } /** * load_compressed_image - Load compressed image data and decompress it. * @handle: Swap map handle to use for loading data. * @snapshot: Image to copy uncompressed data into. 
* @nr_to_read: Number of pages to load.
 */
static int load_compressed_image(struct swap_map_handle *handle,
				 struct snapshot_handle *snapshot,
				 unsigned int nr_to_read)
{
	unsigned int m;
	int ret = 0;
	/* 0: more data expected, 1: end of data seen, 2: and all reads done */
	int eof = 0;
	struct hib_bio_batch hb;
	ktime_t start;
	ktime_t stop;
	unsigned nr_pages;
	size_t off;
	unsigned i, thr, run_threads, nr_threads;
	/*
	 * Read ring state: @ring is the next slot to read into, @pg the next
	 * slot to consume, @have counts pages read and ready, @asked pages
	 * submitted but not yet waited for, @want slots free for new reads.
	 */
	unsigned ring = 0, pg = 0, ring_size = 0,
	         have = 0, want, need, asked = 0;
	unsigned long read_pages = 0;
	unsigned char **page = NULL;
	struct dec_data *data = NULL;
	struct crc_data *crc = NULL;

	hib_init_batch(&hb);

	/*
	 * We'll limit the number of threads for decompression to limit memory
	 * footprint.
	 */
	nr_threads = num_online_cpus() - 1;
	nr_threads = clamp_val(nr_threads, 1, CMP_THREADS);

	/* Array of page pointers forming the read ring. */
	page = vmalloc(array_size(CMP_MAX_RD_PAGES, sizeof(*page)));
	if (!page) {
		pr_err("Failed to allocate %s page\n", hib_comp_algo);
		ret = -ENOMEM;
		goto out_clean;
	}

	data = vzalloc(array_size(nr_threads, sizeof(*data)));
	if (!data) {
		pr_err("Failed to allocate %s data\n", hib_comp_algo);
		ret = -ENOMEM;
		goto out_clean;
	}

	crc = kzalloc(sizeof(*crc), GFP_KERNEL);
	if (!crc) {
		pr_err("Failed to allocate crc\n");
		ret = -ENOMEM;
		goto out_clean;
	}

	clean_pages_on_decompress = true;

	/*
	 * Start the decompression threads.
	 */
	for (thr = 0; thr < nr_threads; thr++) {
		init_waitqueue_head(&data[thr].go);
		init_waitqueue_head(&data[thr].done);

		data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
		if (IS_ERR_OR_NULL(data[thr].cc)) {
			pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
			ret = -EFAULT;
			goto out_clean;
		}

		data[thr].thr = kthread_run(decompress_threadfn,
		                            &data[thr],
		                            "image_decompress/%u", thr);
		if (IS_ERR(data[thr].thr)) {
			/* Cleared so that out_clean does not try to stop it. */
			data[thr].thr = NULL;
			pr_err("Cannot start decompression threads\n");
			ret = -ENOMEM;
			goto out_clean;
		}
	}

	/*
	 * Start the CRC32 thread.
	 */
	init_waitqueue_head(&crc->go);
	init_waitqueue_head(&crc->done);

	handle->crc32 = 0;
	crc->crc32 = &handle->crc32;
	/* The CRC thread reads the decompressed buffers of the workers. */
	for (thr = 0; thr < nr_threads; thr++) {
		crc->unc[thr] = data[thr].unc;
		crc->unc_len[thr] = &data[thr].unc_len;
	}

	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
	if (IS_ERR(crc->thr)) {
		crc->thr = NULL;
		pr_err("Cannot start CRC32 thread\n");
		ret = -ENOMEM;
		goto out_clean;
	}

	/*
	 * Set the number of pages for read buffering.
	 * This is complete guesswork, because we'll only know the real
	 * picture once prepare_image() is called, which is much later on
	 * during the image load phase. We'll assume the worst case and
	 * say that none of the image pages are from high memory.
	 */
	if (low_free_pages() > snapshot_get_image_size())
		read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
	read_pages = clamp_val(read_pages, CMP_MIN_RD_PAGES, CMP_MAX_RD_PAGES);

	/*
	 * The first CMP_PAGES ring pages are mandatory; beyond that the
	 * allocation is opportunistic and the ring is simply made smaller if
	 * it fails.
	 */
	for (i = 0; i < read_pages; i++) {
		page[i] = (void *)__get_free_page(i < CMP_PAGES ?
						  GFP_NOIO | __GFP_HIGH :
						  GFP_NOIO | __GFP_NOWARN |
						  __GFP_NORETRY);

		if (!page[i]) {
			if (i < CMP_PAGES) {
				/* Record how many pages out_clean must free. */
				ring_size = i;
				pr_err("Failed to allocate %s pages\n", hib_comp_algo);
				ret = -ENOMEM;
				goto out_clean;
			} else {
				break;
			}
		}
	}
	want = ring_size = i;

	pr_info("Using %u thread(s) for %s decompression\n", nr_threads, hib_comp_algo);
	pr_info("Loading and decompressing image data (%u pages)...\n",
		nr_to_read);
	/* Progress is reported every m pages (a tenth of the image, min 1). */
	m = nr_to_read / 10;
	if (!m)
		m = 1;
	nr_pages = 0;
	start = ktime_get();

	ret = snapshot_write_next(snapshot);
	if (ret <= 0)
		goto out_finish;

	for(;;) {
		/* Queue reads for as many free ring slots as we have. */
		for (i = 0; !eof && i < want; i++) {
			ret = swap_read_page(handle, page[ring], &hb);
			if (ret) {
				/*
				 * On real read error, finish. On end of data,
				 * set EOF flag and just exit the read loop.
				 */
				if (handle->cur &&
				    handle->cur->entries[handle->k]) {
					goto out_finish;
				} else {
					eof = 1;
					break;
				}
			}
			if (++ring >= ring_size)
				ring = 0;
		}
		asked += i;
		want -= i;

		/*
		 * We are out of data, wait for some more.
		 */
		if (!have) {
			/* Nothing ready and nothing in flight: all consumed. */
			if (!asked)
				break;

			ret = hib_wait_io(&hb);
			if (ret)
				goto out_finish;
			have += asked;
			asked = 0;
			if (eof)
				eof = 2;
		}

		/* The previous CRC pass must end before buffers are reused. */
		if (crc->run_threads) {
			wait_event(crc->done, atomic_read_acquire(&crc->stop));
			atomic_set(&crc->stop, 0);
			crc->run_threads = 0;
		}

		/* Hand one compressed chunk to each thread while data lasts. */
		for (thr = 0; have && thr < nr_threads; thr++) {
			/* Each chunk starts with its compressed length. */
			data[thr].cmp_len = *(size_t *)page[pg];
			if (unlikely(!data[thr].cmp_len ||
			             data[thr].cmp_len >
					bytes_worst_compress(UNC_SIZE))) {
				pr_err("Invalid %s compressed length\n", hib_comp_algo);
				ret = -1;
				goto out_finish;
			}

			need = DIV_ROUND_UP(data[thr].cmp_len + CMP_HEADER,
			                    PAGE_SIZE);
			if (need > have) {
				/* No more reads will arrive: chunk is truncated. */
				if (eof > 1) {
					ret = -1;
					goto out_finish;
				}
				break;
			}

			for (off = 0;
			     off < CMP_HEADER + data[thr].cmp_len;
			     off += PAGE_SIZE) {
				memcpy(data[thr].cmp + off,
				       page[pg], PAGE_SIZE);
				have--;
				want++;
				if (++pg >= ring_size)
					pg = 0;
			}

			atomic_set_release(&data[thr].ready, 1);
			wake_up(&data[thr].go);
		}

		/*
		 * Wait for more data while we are decompressing.
		 */
		if (have < CMP_PAGES && asked) {
			ret = hib_wait_io(&hb);
			if (ret)
				goto out_finish;
			have += asked;
			asked = 0;
			if (eof)
				eof = 2;
		}

		/* Collect the results in thread order and feed the snapshot. */
		for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
			wait_event(data[thr].done,
				atomic_read_acquire(&data[thr].stop));
			atomic_set(&data[thr].stop, 0);

			ret = data[thr].ret;

			if (ret < 0) {
				pr_err("%s decompression failed\n", hib_comp_algo);
				goto out_finish;
			}

			/* Output must be a non-zero whole number of pages. */
			if (unlikely(!data[thr].unc_len ||
				data[thr].unc_len > UNC_SIZE ||
				data[thr].unc_len & (PAGE_SIZE - 1))) {
				pr_err("Invalid %s uncompressed length\n", hib_comp_algo);
				ret = -1;
				goto out_finish;
			}

			for (off = 0;
			     off < data[thr].unc_len; off += PAGE_SIZE) {
				memcpy(data_of(*snapshot),
				       data[thr].unc + off, PAGE_SIZE);

				if (!(nr_pages % m))
					pr_info("Image loading progress: %3d%%\n",
						nr_pages / m * 10);
				nr_pages++;

				ret = snapshot_write_next(snapshot);
				if (ret <= 0) {
					/*
					 * The snapshot takes no more pages:
					 * CRC the buffers up to and including
					 * this one, then finish.
					 */
					crc->run_threads = thr + 1;
					atomic_set_release(&crc->ready, 1);
					wake_up(&crc->go);
					goto out_finish;
				}
			}
		}

		/* Start the CRC pass over the buffers just decompressed. */
		crc->run_threads = thr;
		atomic_set_release(&crc->ready, 1);
		wake_up(&crc->go);
	}

out_finish:
	/* Let a CRC pass that is still running complete. */
	if (crc->run_threads) {
		wait_event(crc->done, atomic_read_acquire(&crc->stop));
		atomic_set(&crc->stop, 0);
	}
	stop = ktime_get();
	if (!ret) {
		pr_info("Image loading done\n");
		ret = snapshot_write_finalize(snapshot);
		if (!ret && !snapshot_image_loaded(snapshot))
			ret = -ENODATA;
		if (!ret) {
			/* Verify the checksum if the header says one was stored. */
			if (swsusp_header->flags & SF_CRC32_MODE) {
				if(handle->crc32 != swsusp_header->crc32) {
					pr_err("Invalid image CRC32!\n");
					ret = -ENODATA;
				}
			}
		}
	}
	swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
	hib_finish_batch(&hb);
	/* ring_size is still 0 if we failed before any ring page was allocated. */
	for (i = 0; i < ring_size; i++)
		free_page((unsigned long)page[i]);
	if (crc) {
		if (crc->thr)
			kthread_stop(crc->thr);
		kfree(crc);
	}
	if (data) {
		for (thr = 0; thr < nr_threads; thr++) {
			if (data[thr].thr)
				kthread_stop(data[thr].thr);
			if (data[thr].cc)
				crypto_free_comp(data[thr].cc);
		}
		vfree(data);
	}
	vfree(page);

	return ret;
}

/**
 *	swsusp_read - read the hibernation image.
 *	@flags_p: flags passed by the "frozen" kernel in the image header should
 *		  be written into this memory location
 *
 *	Reads the image header page first and then loads the remaining
 *	header->pages - 1 pages with load_image() or load_compressed_image(),
 *	depending on SF_NOCOMPRESS_MODE in the flags.
 */

int swsusp_read(unsigned int *flags_p)
{
	int error;
	struct swap_map_handle handle;
	struct snapshot_handle snapshot;
	struct swsusp_info *header;

	memset(&snapshot, 0, sizeof(struct snapshot_handle));
	error = snapshot_write_next(&snapshot);
	/* Anything short of a full page is a failure here. */
	if (error < (int)PAGE_SIZE)
		return error < 0 ? error : -EFAULT;
	header = (struct swsusp_info *)data_of(snapshot);
	error = get_swap_reader(&handle, flags_p);
	if (error)
		goto end;
	/* NOTE(review): error is always zero here, so this test is redundant. */
	if (!error)
		error = swap_read_page(&handle, header, NULL);
	if (!error) {
		error = (*flags_p & SF_NOCOMPRESS_MODE) ?
			load_image(&handle, &snapshot, header->pages - 1) :
			load_compressed_image(&handle, &snapshot, header->pages - 1);
	}
	swap_reader_finish(&handle);
end:
	if (!error)
		pr_debug("Image successfully loaded\n");
	else
		pr_debug("Error %d resuming\n", error);
	return error;
}

/* Its address serves as the holder for exclusive opens of the resume device. */
static void *swsusp_holder;

/**
 * swsusp_check - Open the resume device and check for the swsusp signature.
 * @exclusive: Open the resume device exclusively.
*/ int swsusp_check(bool exclusive) { void *holder = exclusive ? &swsusp_holder : NULL; int error; hib_resume_bdev_file = bdev_file_open_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (error) goto put; if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; } if (!error && swsusp_header->flags & SF_HW_SIG && swsusp_header->hw_sig != swsusp_hardware_signature) { pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n", swsusp_header->hw_sig, swsusp_hardware_signature); error = -EINVAL; } put: if (error) bdev_fput(hib_resume_bdev_file); else pr_debug("Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev_file); } if (error) pr_debug("Image not found (code %d)\n", error); return error; } /** * swsusp_close - close resume device. */ void swsusp_close(void) { if (IS_ERR(hib_resume_bdev_file)) { pr_debug("Image device not initialised\n"); return; } fput(hib_resume_bdev_file); } /** * swsusp_unmark - Unmark swsusp signature in the resume device */ #ifdef CONFIG_SUSPEND int swsusp_unmark(void) { int error; hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; } /* * We just returned from suspend, we don't need the image any more. 
*/ free_all_swap_pages(root_swap); return error; } #endif static int __init swsusp_header_init(void) { swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); if (!swsusp_header) panic("Could not allocate memory for swsusp_header\n"); return 0; } core_initcall(swsusp_header_init);
7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 /* * linux/fs/nls/nls_iso8859-13.c * * Charset iso8859-13 translation tables. * The Unicode to charset table has only exact mappings. 
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, /* 0xb0*/ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, /* 0xc0*/ 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, /* 0xd0*/ 0x0160, 0x0143, 0x0145, 
0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, /* 0xe0*/ 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, /* 0xf0*/ 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0xa2, 0xa3, 0xa4, 0x00, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0x00, 0xa9, 0x00, 0xab, 0xac, 0xad, 0xae, 0x00, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0x00, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0x00, 0xb9, 0x00, 0xbb, 0xbc, 0xbd, 0xbe, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0xc4, 0xc5, 
0xaf, 0x00, /* 0xc0-0xc7 */ 0x00, 0xc9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xd3, 0x00, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xa8, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0xe4, 0xe5, 0xbf, 0x00, /* 0xe0-0xe7 */ 0x00, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xf3, 0x00, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xb8, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0xc2, 0xe2, 0x00, 0x00, 0xc0, 0xe0, 0xc3, 0xe3, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0xc7, 0xe7, 0x00, 0x00, 0xcb, 0xeb, /* 0x10-0x17 */ 0xc6, 0xe6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xcc, 0xec, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0xce, 0xee, 0x00, 0x00, 0xc1, 0xe1, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xcd, 0xed, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0xcf, 0xef, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0xd9, 0xf9, 0xd1, 0xf1, 0xd2, 0xf2, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0xd4, 0xf4, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xba, /* 0x50-0x57 */ 0x00, 0x00, 0xda, 0xfa, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xd0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0xdb, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0xd8, 0xf8, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0xca, 0xea, 0xdd, 0xfd, 0xde, 0xfe, 0x00, /* 0x78-0x7f */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xff, 0x00, 0x00, 0xb4, 0xa1, 0xa5, 0x00, /* 0x18-0x1f */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf 
*/ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 
0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-13", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_13(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_13(void) { unregister_nls(&table); } module_init(init_nls_iso8859_13) module_exit(exit_nls_iso8859_13) MODULE_DESCRIPTION("NLS ISO 8859-13 (Latin 7; Baltic)"); MODULE_LICENSE("Dual BSD/GPL");
812 8 818 818 5 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Credential hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/cred.h> #include <linux/lsm_hooks.h> #include "common.h" #include "cred.h" #include "ruleset.h" #include "setup.h" static void hook_cred_transfer(struct cred *const new, const struct cred *const old) { struct landlock_ruleset *const old_dom = landlock_cred(old)->domain; if (old_dom) { landlock_get_ruleset(old_dom); landlock_cred(new)->domain = old_dom; } } static int hook_cred_prepare(struct cred *const new, const struct cred *const old, const gfp_t gfp) { hook_cred_transfer(new, old); return 0; } static void hook_cred_free(struct cred *const cred) { struct landlock_ruleset *const dom = landlock_cred(cred)->domain; if (dom) landlock_put_ruleset_deferred(dom); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_transfer, hook_cred_transfer), LSM_HOOK_INIT(cred_free, hook_cred_free), }; __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); }
29 29 29 27 10 5 10 5 10 9 4 4 9 10 10 6 4 10 5 7 5 10 10 10 10 9 10 1 15 9 5 5 10 10 2 10 10 10 10 10 2 10 2 10 10 10 9 10 10 10 19 18 6 1 5 5 17 1 5 5 5 5 5 28 1 1 26 3 29 29 29 50 45 3 16 3 3 2 44 2 41 31 4 7 2 3 2 12 6 4 3 1 1 54 4 17 1 79 82 69 12 2 2 2 6 34 2 8 4 2 6 3 19 8 18 1 35 1 1 1 55 2 1 50 1 6 31 12 42 34 36 6 14 29 28 29 11 25 2 1 1 1 28 29 28 56 54 39 17 4 4 4 2 4 3 4 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 
930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 
1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/gfp.h> #include <net/sock.h> #include <linux/in.h> #include <linux/list.h> #include <linux/ratelimit.h> #include <linux/export.h> #include <linux/sizes.h> #include "rds.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog * will kick our shin. * Also, it seems fairer to not let one busy connection stall all the * others. * * send_batch_count is the number of times we'll loop in send_xmit. Setting * it to 0 will restore the old behavior (where we looped until we had * drained the queue). */ static int send_batch_count = SZ_1K; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); static void rds_send_remove_from_sock(struct list_head *messages, int status); /* * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). */ void rds_send_path_reset(struct rds_conn_path *cp) { struct rds_message *rm, *tmp; unsigned long flags; if (cp->cp_xmit_rm) { rm = cp->cp_xmit_rm; cp->cp_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. 
This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ rds_message_unmapped(rm); rds_message_put(rm); } cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_data_sent = 0; cp->cp_conn->c_map_queued = 0; cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; /* Mark messages as retransmissions, and move them to the send q */ spin_lock_irqsave(&cp->cp_lock, flags); list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); } list_splice_init(&cp->cp_retrans, &cp->cp_send_queue); spin_unlock_irqrestore(&cp->cp_lock, flags); } EXPORT_SYMBOL_GPL(rds_send_path_reset); static int acquire_in_xmit(struct rds_conn_path *cp) { return test_and_set_bit_lock(RDS_IN_XMIT, &cp->cp_flags) == 0; } static void release_in_xmit(struct rds_conn_path *cp) { clear_bit_unlock(RDS_IN_XMIT, &cp->cp_flags); /* * We don't use wait_on_bit()/wake_up_bit() because our waking is in a * hot path and finding waiters is very rare. We don't want to walk * the system-wide hashed waitqueue buckets in the fast path only to * almost never find waiters. */ if (waitqueue_active(&cp->cp_waitq)) wake_up_all(&cp->cp_waitq); } /* * We're making the conscious trade-off here to only send one message * down the connection at a time. 
* Pro: * - tx queueing is a simple fifo list * - reassembly is optional and easily done by transports per conn * - no per flow rx lookup at all, straight to the socket * - less per-frag memory and wire overhead * Con: * - queued acks can be delayed behind large messages * Depends: * - small message latency is higher behind queued large messages * - large message latency isn't starved by intervening small sends */ int rds_send_xmit(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; struct rds_message *rm; unsigned long flags; unsigned int tmp; struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); int batch_count; unsigned long send_gen = 0; int same_rm = 0; restart: batch_count = 0; /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. */ if (!acquire_in_xmit(cp)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } if (rds_destroy_pending(cp->cp_conn)) { release_in_xmit(cp); ret = -ENETUNREACH; /* dont requeue send work */ goto out; } /* * we record the send generation after doing the xmit acquire. * if someone else manages to jump in and do some work, we'll use * this to avoid a goto restart farther down. * * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ send_gen = READ_ONCE(cp->cp_send_gen) + 1; WRITE_ONCE(cp->cp_send_gen, send_gen); /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. */ if (!rds_conn_path_up(cp)) { release_in_xmit(cp); ret = 0; goto out; } if (conn->c_trans->xmit_path_prepare) conn->c_trans->xmit_path_prepare(cp); /* * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. 
*/ while (1) { rm = cp->cp_xmit_rm; if (!rm) { same_rm = 0; } else { same_rm++; if (same_rm >= 4096) { rds_stats_inc(s_send_stuck_rm); ret = -EAGAIN; break; } } /* * If between sending messages, we can send a pending congestion * map update. */ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } rm->data.op_active = 1; rm->m_inc.i_conn_path = cp; rm->m_inc.i_conn = cp->cp_conn; cp->cp_xmit_rm = rm; } /* * If not already working on one, grab the next message. * * cp_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ if (!rm) { unsigned int len; batch_count++; /* we want to process as big a batch as we can, but * we also want to avoid softlockups. If we've been * through a lot of messages, lets back off and see * if anyone else jumps in */ if (batch_count >= send_batch_count) goto over_batch; spin_lock_irqsave(&cp->cp_lock, flags); if (!list_empty(&cp->cp_send_queue)) { rm = list_entry(cp->cp_send_queue.next, struct rds_message, m_conn_item); rds_message_addref(rm); /* * Move the message from the send queue to the retransmit * list right away. */ list_move_tail(&rm->m_conn_item, &cp->cp_retrans); } spin_unlock_irqrestore(&cp->cp_lock, flags); if (!rm) break; /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire * queue pair to error state. We could possibly * recover from that, but right now we drop the * connection. * Therefore, we never retransmit messages with RDMA ops. 
*/ if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&cp->cp_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); if (cp->cp_unacked_packets == 0 || cp->cp_unacked_bytes < len) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); cp->cp_unacked_packets = rds_sysctl_max_unacked_packets; cp->cp_unacked_bytes = rds_sysctl_max_unacked_bytes; rds_stats_inc(s_send_ack_required); } else { cp->cp_unacked_bytes -= len; cp->cp_unacked_packets--; } cp->cp_xmit_rm = rm; } /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !cp->cp_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_rdma_sent = 1; } if (rm->atomic.op_active && !cp->cp_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); if (ret) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); wake_up_interruptible(&rm->m_flush_wait); break; } cp->cp_xmit_atomic_sent = 1; } /* * A number of cases require an RDS header to be sent * even if there is no data. * We permit 0-byte sends; rds-ping depends on this. * However, if there are exclusively attached silent ops, * we skip the hdr/data send, to enable silent operation. 
*/ if (rm->data.op_nents == 0) { int ops_present; int all_ops_are_silent = 1; ops_present = (rm->atomic.op_active || rm->rdma.op_active); if (rm->atomic.op_active && !rm->atomic.op_silent) all_ops_are_silent = 0; if (rm->rdma.op_active && !rm->rdma.op_silent) all_ops_are_silent = 0; if (ops_present && all_ops_are_silent && !rm->m_rdma_cookie) rm->data.op_active = 0; } if (rm->data.op_active && !cp->cp_xmit_data_sent) { rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, cp->cp_xmit_hdr_off, cp->cp_xmit_sg, cp->cp_xmit_data_off); if (ret <= 0) break; if (cp->cp_xmit_hdr_off < sizeof(struct rds_header)) { tmp = min_t(int, ret, sizeof(struct rds_header) - cp->cp_xmit_hdr_off); cp->cp_xmit_hdr_off += tmp; ret -= tmp; } sg = &rm->data.op_sg[cp->cp_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - cp->cp_xmit_data_off); cp->cp_xmit_data_off += tmp; ret -= tmp; if (cp->cp_xmit_data_off == sg->length) { cp->cp_xmit_data_off = 0; sg++; cp->cp_xmit_sg++; BUG_ON(ret != 0 && cp->cp_xmit_sg == rm->data.op_nents); } } if (cp->cp_xmit_hdr_off == sizeof(struct rds_header) && (cp->cp_xmit_sg == rm->data.op_nents)) cp->cp_xmit_data_sent = 1; } /* * A rm will only take multiple times through this loop * if there is a data op. Thus, if the data is sent (or there was * none), then we're done with the rm. */ if (!rm->data.op_active || cp->cp_xmit_data_sent) { cp->cp_xmit_rm = NULL; cp->cp_xmit_sg = 0; cp->cp_xmit_hdr_off = 0; cp->cp_xmit_data_off = 0; cp->cp_xmit_rdma_sent = 0; cp->cp_xmit_atomic_sent = 0; cp->cp_xmit_data_sent = 0; rds_message_put(rm); } } over_batch: if (conn->c_trans->xmit_path_complete) conn->c_trans->xmit_path_complete(cp); release_in_xmit(cp); /* Nuke any messages we decided not to retransmit. 
*/
	if (!list_empty(&to_be_dropped)) {
		/* irqs on here, so we can put(), unlike above */
		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
			rds_message_put(rm);
		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
	}

	/*
	 * Other senders can queue a message after we last test the send queue
	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
	 * not try and send their newly queued message.  We need to check the
	 * send queue after having cleared RDS_IN_XMIT so that their message
	 * doesn't get stuck on the send queue.
	 *
	 * If the transport cannot continue (i.e ret != 0), then it must
	 * call us when more room is available, such as from the tx
	 * completion handler.
	 *
	 * We have an extra generation check here so that if someone manages
	 * to jump in after our release_in_xmit, we'll see that they have done
	 * some work and we will skip our goto
	 */
	if (ret == 0) {
		bool raced;

		smp_mb();
		raced = send_gen != READ_ONCE(cp->cp_send_gen);

		if ((test_bit(0, &conn->c_map_queued) ||
		    !list_empty(&cp->cp_send_queue)) && !raced) {
			if (batch_count < send_batch_count)
				goto restart;
			rcu_read_lock();
			if (rds_destroy_pending(cp->cp_conn))
				ret = -ENETUNREACH;
			else
				queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
			rcu_read_unlock();
		} else if (raced) {
			rds_stats_inc(s_send_lock_queue_raced);
		}
	}
out:
	return ret;
}
EXPORT_SYMBOL_GPL(rds_send_xmit);

/*
 * Give back the send-buffer space that @rm was charged for on @rs.
 *
 * The amount is the payload length recorded in the message header (h_len,
 * stored big-endian).  The caller must hold rs->rs_lock; this is asserted.
 * It is a BUG if the socket accounts for fewer bytes than the message
 * claims.  The s_send_queue_empty statistic is bumped when the accounted
 * byte count drops to zero.
 */
static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
{
	u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	assert_spin_locked(&rs->rs_lock);

	BUG_ON(rs->rs_snd_bytes < len);
	rs->rs_snd_bytes -= len;

	if (rs->rs_snd_bytes == 0)
		rds_stats_inc(s_send_queue_empty);
}

/*
 * Decide whether @rm is covered by the acknowledgement value @ack.
 *
 * If the caller supplies an @is_acked callback, the decision is delegated
 * to it entirely.  Otherwise a message counts as acked when the sequence
 * number in its header (h_sequence, stored big-endian) is <= @ack.
 *
 * Returns non-zero when the message is acked.
 */
static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
				    is_acked_func is_acked)
{
	if (is_acked)
		return is_acked(rm, ack);
	return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
}

/*
 * This is pretty similar to what happens below in the ACK
 * handling code - except that we call here as soon as we get
 * the IB send completion on the RDMA op and the
accompanying * message. */ void rds_rdma_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ro->op_active && ro->op_notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(&notifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ro->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* * Just like above, except looks at atomic op */ void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; struct rm_atomic_op *ao; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); ao = &rm->atomic; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ao->op_active && ao->op_notify && ao->op_notifier) { notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; spin_lock(&rs->rs_lock); list_add_tail(&notifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); ao->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we * don't do any locking - we have all the ingredients (message, * socket, socket lock) and can just move the notifier. 
*/ static inline void __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { struct rm_rdma_op *ro; struct rm_atomic_op *ao; ro = &rm->rdma; if (ro->op_active && ro->op_notify && ro->op_notifier) { ro->op_notifier->n_status = status; list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); ro->op_notifier = NULL; } ao = &rm->atomic; if (ao->op_active && ao->op_notify && ao->op_notifier) { ao->op_notifier->n_status = status; list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ } /* * This removes messages from the socket's list if they're on it. The list * argument must be private to the caller, we must be able to modify it * without locks. The messages must have a reference held for their * position on the list. This function will drop that reference after * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ static void rds_send_remove_from_sock(struct list_head *messages, int status) { unsigned long flags; struct rds_sock *rs = NULL; struct rds_message *rm; while (!list_empty(messages)) { int was_on_sock = 0; rm = list_entry(messages->next, struct rds_message, m_conn_item); list_del_init(&rm->m_conn_item); /* * If we see this flag cleared then we're *sure* that someone * else beat us to removing it from the sock. If we race * with their flag update we'll get the lock and then really * see that the flag has been cleared. * * The message spinlock makes sure nobody clears rm->m_rs * while we're messing with it. It does not prevent the * message from being removed from the socket, though. 
*/ spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; if (rs != rm->m_rs) { if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; if (rs) sock_hold(rds_rs_to_sk(rs)); } if (!rs) goto unlock_and_drop; spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); if (ro->op_active && ro->op_notifier && (ro->op_notify || (ro->op_recverr && status))) { notifier = ro->op_notifier; list_add_tail(&notifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; rm->rdma.op_notifier = NULL; } was_on_sock = 1; } spin_unlock(&rs->rs_lock); unlock_and_drop: spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); if (was_on_sock) rds_message_put(rm); } if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } /* * Transports call here when they've determined that the receiver queued * messages up to, and including, the given sequence number. Messages are * moved to the retrans queue when rds_send_xmit picks them off the send * queue. This means that in the TCP case, the message may not have been * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked * checks the RDS_MSG_HAS_ACK_SEQ bit. 
*/
void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
			      is_acked_func is_acked)
{
	struct rds_message *rm, *tmp;
	unsigned long flags;
	LIST_HEAD(list);

	spin_lock_irqsave(&cp->cp_lock, flags);

	/* Walk the retransmit list from the head and stop at the first
	 * message that is not acked yet; everything before it moves to the
	 * private list.
	 */
	list_for_each_entry_safe(rm, tmp, &cp->cp_retrans, m_conn_item) {
		if (!rds_send_is_acked(rm, ack, is_acked))
			break;

		list_move(&rm->m_conn_item, &list);
		clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	}

	/* order flag updates with spin locks */
	if (!list_empty(&list))
		smp_mb__after_atomic();

	spin_unlock_irqrestore(&cp->cp_lock, flags);

	/* now remove the messages from the sock list as needed */
	rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
EXPORT_SYMBOL_GPL(rds_send_path_drop_acked);

/*
 * Connection-level wrapper around rds_send_path_drop_acked() that always
 * operates on path 0.  It warns if the transport is multipath capable.
 */
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked)
{
	WARN_ON(conn->c_trans->t_mp_capable);
	rds_send_path_drop_acked(&conn->c_path[0], ack, is_acked);
}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);

/*
 * Cancel messages queued on socket @rs.
 *
 * With @dest == NULL every message on the socket's send queue is dropped;
 * otherwise only those whose destination address and port match @dest.
 * The selected messages are taken off the socket, then off their
 * connection path, and any pending RDMA/atomic notifier is completed with
 * RDS_RDMA_CANCELED.
 */
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
{
	struct rds_message *rm, *tmp;
	struct rds_connection *conn;
	struct rds_conn_path *cp;
	unsigned long flags;
	LIST_HEAD(list);

	/* get all the messages we're dropping under the rs lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
		if (dest &&
		    (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
		     dest->sin6_port != rm->m_inc.i_hdr.h_dport))
			continue;

		list_move(&rm->m_sock_item, &list);
		rds_send_sndbuf_remove(rs, rm);
		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	}

	/* order flag updates with the rs lock */
	smp_mb__after_atomic();

	spin_unlock_irqrestore(&rs->rs_lock, flags);

	if (list_empty(&list))
		return;

	/* Remove the messages from the conn */
	list_for_each_entry(rm, &list, m_sock_item) {

		conn = rm->m_inc.i_conn;
		if (conn->c_trans->t_mp_capable)
			cp = rm->m_inc.i_conn_path;
		else
			cp = &conn->c_path[0];

		spin_lock_irqsave(&cp->cp_lock, flags);
		/*
		 * Maybe someone else beat us to removing rm from the conn.
		 * If we race with their flag update we'll get the lock and
		 * then really see that the flag has been cleared.
		 */
		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			spin_unlock_irqrestore(&cp->cp_lock, flags);
			continue;
		}
		list_del_init(&rm->m_conn_item);
		spin_unlock_irqrestore(&cp->cp_lock, flags);

		/*
		 * Couldn't grab m_rs_lock in top loop (lock ordering),
		 * but we can now.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);

		spin_lock(&rs->rs_lock);
		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
		spin_unlock(&rs->rs_lock);

		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

		/* drop the reference that was held for the connection list */
		rds_message_put(rm);
	}

	rds_wake_sk_sleep(rs);

	/* Second pass: empty the private list and drop the socket-list
	 * reference of every message on it.
	 */
	while (!list_empty(&list)) {
		rm = list_entry(list.next, struct rds_message, m_sock_item);
		list_del_init(&rm->m_sock_item);
		rds_message_wait(rm);

		/* just in case the code above skipped this message
		 * because RDS_MSG_ON_CONN wasn't set, run it again here
		 * taking m_rs_lock is the only thing that keeps us
		 * from racing with ack processing.
		 */
		spin_lock_irqsave(&rm->m_rs_lock, flags);

		spin_lock(&rs->rs_lock);
		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
		spin_unlock(&rs->rs_lock);

		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

		rds_message_put(rm);
	}
}

/*
 * Try to queue @rm on socket @rs and on connection path @cp.
 *
 * we only want this to fire once so we use the callers 'queued'.  It's
 * possible that another thread can race with us and remove the
 * message from the flow with RDS_CANCEL_SENT_TO.
 *
 * Returns the value of *queued: non-zero once the message has been queued
 * (now or by an earlier call), zero if the send buffer had no room.
 */
static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
			     struct rds_conn_path *cp,
			     struct rds_message *rm, __be16 sport,
			     __be16 dport, int *queued)
{
	unsigned long flags;
	u32 len;

	if (*queued)
		goto out;

	len = be32_to_cpu(rm->m_inc.i_hdr.h_len);

	/* this is the only place which holds both the socket's rs_lock
	 * and the connection path's cp_lock */
	spin_lock_irqsave(&rs->rs_lock, flags);

	/*
	 * If there is a little space in sndbuf, we don't queue anything,
	 * and userspace gets -EAGAIN. But poll() indicates there's send
	 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
	 * freed up by incoming acks. So we check the *old* value of
	 * rs_snd_bytes here to allow the last msg to exceed the buffer,
	 * and poll() now knows no more data can be sent.
	 */
	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
		rs->rs_snd_bytes += len;

		/* let recv side know we are close to send space exhaustion.
		 * This is probably not the optimal way to do it, as this
		 * means we set the flag on *all* messages as soon as our
		 * throughput hits a certain threshold.
		 */
		if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
			set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);

		/* The socket's send queue takes its own message reference,
		 * and the message pins the socket in return.
		 */
		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
		rds_message_addref(rm);
		sock_hold(rds_rs_to_sk(rs));
		rm->m_rs = rs;

		/* The code ordering is a little weird, but we're
		   trying to minimize the time we hold cp_lock */
		rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
		rm->m_inc.i_conn = conn;
		rm->m_inc.i_conn_path = cp;
		/* second reference, for the connection path's send queue */
		rds_message_addref(rm);

		spin_lock(&cp->cp_lock);
		rm->m_inc.i_hdr.h_sequence = cpu_to_be64(cp->cp_next_tx_seq++);
		list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
		set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
		spin_unlock(&cp->cp_lock);

		rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
			 rm, len, rs, rs->rs_snd_bytes,
			 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));

		*queued = 1;
	}

	spin_unlock_irqrestore(&rs->rs_lock, flags);
out:
	return *queued;
}

/*
 * rds_message is getting to be quite complicated, and we'd like to allocate
 * it all in one go. This figures out how big it needs to be up front.
*/
static int rds_rm_size(struct msghdr *msg, int num_sgs,
		       struct rds_iov_vector_arr *vct)
{
	struct cmsghdr *cmsg;
	int size = 0;
	int cmsg_groups = 0;
	int retval;
	bool zcopy_cookie = false;
	struct rds_iov_vector *iov, *tmp_iov;

	if (num_sgs < 0)
		return -EINVAL;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;

		switch (cmsg->cmsg_type) {
		case RDS_CMSG_RDMA_ARGS:
			/* Grow the iov vector array by vct->incr entries when
			 * it is full; on allocation failure the recorded
			 * length is rolled back.
			 */
			if (vct->indx >= vct->len) {
				vct->len += vct->incr;
				tmp_iov =
					krealloc(vct->vec,
						 vct->len *
						 sizeof(struct rds_iov_vector),
						 GFP_KERNEL);
				if (!tmp_iov) {
					vct->len -= vct->incr;
					return -ENOMEM;
				}
				vct->vec = tmp_iov;
			}
			iov = &vct->vec[vct->indx];
			memset(iov, 0, sizeof(struct rds_iov_vector));
			vct->indx++;
			cmsg_groups |= 1;
			retval = rds_rdma_extra_size(CMSG_DATA(cmsg), iov);
			if (retval < 0)
				return retval;
			size += retval;

			break;

		case RDS_CMSG_ZCOPY_COOKIE:
			zcopy_cookie = true;
			fallthrough;

		case RDS_CMSG_RDMA_DEST:
		case RDS_CMSG_RDMA_MAP:
			cmsg_groups |= 2;
			/* these are valid but do not add any size */
			break;

		case RDS_CMSG_ATOMIC_CSWP:
		case RDS_CMSG_ATOMIC_FADD:
		case RDS_CMSG_MASKED_ATOMIC_CSWP:
		case RDS_CMSG_MASKED_ATOMIC_FADD:
			cmsg_groups |= 1;
			size += sizeof(struct scatterlist);
			break;

		default:
			return -EINVAL;
		}

	}

	/* MSG_ZEROCOPY is only accepted together with a zcopy cookie cmsg */
	if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
		return -EINVAL;

	size += num_sgs * sizeof(struct scatterlist);

	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
	if (cmsg_groups == 3)
		return -EINVAL;

	return size;
}

/*
 * Handle an RDS_CMSG_ZCOPY_COOKIE control message: copy the 32-bit cookie
 * from the cmsg payload into the message's zerocopy notifier.
 *
 * Returns 0 on success, -EINVAL if the cmsg is too short to hold the
 * cookie or the message has no zerocopy notifier.
 */
static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
			  struct cmsghdr *cmsg)
{
	u32 *cookie;

	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) ||
	    !rm->data.op_mmp_znotifier)
		return -EINVAL;
	cookie = CMSG_DATA(cmsg);
	rm->data.op_mmp_znotifier->z_cookie = *cookie;
	return 0;
}

/*
 * Apply every SOL_RDS control message in @msg to @rm.
 *
 * @allocated_mr is set to 1 when an RDS_CMSG_RDMA_MAP cmsg succeeded.
 * @vct supplies one iov vector per RDS_CMSG_RDMA_ARGS cmsg, in order.
 *
 * Returns 0 on success.  Processing stops at the first handler that
 * returns non-zero, and that value is returned.  A malformed or unknown
 * cmsg yields -EINVAL, and -ENOMEM is returned when there are more
 * RDMA_ARGS cmsgs than entries recorded in @vct.
 */
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
			 struct msghdr *msg, int *allocated_mr,
			 struct rds_iov_vector_arr *vct)
{
	struct cmsghdr *cmsg;
	int ret = 0, ind = 0;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;

		/* As a side effect, RDMA_DEST and RDMA_MAP will set
		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
		 */
		switch (cmsg->cmsg_type) {
		case RDS_CMSG_RDMA_ARGS:
			if (ind >= vct->indx)
				return -ENOMEM;
			ret = rds_cmsg_rdma_args(rs, rm, cmsg, &vct->vec[ind]);
			ind++;
			break;

		case RDS_CMSG_RDMA_DEST:
			ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
			break;

		case RDS_CMSG_RDMA_MAP:
			ret = rds_cmsg_rdma_map(rs, rm, cmsg);
			if (!ret)
				*allocated_mr = 1;
			else if (ret == -ENODEV)
				/* Accommodate the get_mr() case which can fail
				 * if connection isn't established yet.
				 */
				ret = -EAGAIN;
			break;
		case RDS_CMSG_ATOMIC_CSWP:
		case RDS_CMSG_ATOMIC_FADD:
		case RDS_CMSG_MASKED_ATOMIC_CSWP:
		case RDS_CMSG_MASKED_ATOMIC_FADD:
			ret = rds_cmsg_atomic(rs, rm, cmsg);
			break;

		case RDS_CMSG_ZCOPY_COOKIE:
			ret = rds_cmsg_zcopy(rs, rm, cmsg);
			break;

		default:
			return -EINVAL;
		}

		if (ret)
			break;
	}

	return ret;
}

/*
 * Pick the index of the connection path that socket @rs should send on.
 *
 * While conn->c_npaths is still 0 the hash is computed over
 * RDS_MPATH_WORKERS; afterwards over the actual path count.  If that
 * gives a non-zero index before the path count is known, a ping is sent
 * and, unless @nonblock is set, the caller sleeps until c_npaths becomes
 * non-zero.
 *
 * Returns the path index; 0 is used whenever only the base path is safe.
 */
static int rds_send_mprds_hash(struct rds_sock *rs,
			       struct rds_connection *conn, int nonblock)
{
	int hash;

	if (conn->c_npaths == 0)
		hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
	else
		hash = RDS_MPATH_HASH(rs, conn->c_npaths);
	if (conn->c_npaths == 0 && hash != 0) {
		rds_send_ping(conn, 0);

		/* The underlying connection is not up yet.  Need to wait
		 * until it is up to be sure that the non-zero c_path can be
		 * used.  But if we are interrupted, we have to use the zero
		 * c_path in case the connection ends up being non-MP capable.
		 */
		if (conn->c_npaths == 0) {
			/* Cannot wait for the connection be made, so just use
			 * the base c_path.
			 */
			if (nonblock)
				return 0;
			if (wait_event_interruptible(conn->c_hs_waitq,
						     conn->c_npaths != 0))
				hash = 0;
		}
		if (conn->c_npaths == 1)
			hash = 0;
	}
	return hash;
}

/*
 * Add up the remote_vec.bytes of every RDS_CMSG_RDMA_ARGS control message
 * in @msg into *@rdma_bytes.  The caller initialises *@rdma_bytes; this
 * function only accumulates.
 *
 * Returns 0 on success, -EINVAL for a malformed cmsg or an RDMA_ARGS cmsg
 * too short to hold struct rds_rdma_args.
 */
static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
{
	struct rds_rdma_args *args;
	struct cmsghdr *cmsg;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level != SOL_RDS)
			continue;

		if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
			if (cmsg->cmsg_len <
			    CMSG_LEN(sizeof(struct rds_rdma_args)))
				return -EINVAL;
			args = CMSG_DATA(cmsg);
			*rdma_bytes += args->remote_vec.bytes;
		}
	}
	return 0;
}

/*
 * sendmsg() implementation for RDS sockets.
 *
 * Validates the flags and the destination (an explicit msg_name, or the
 * address the socket was connected to), allocates a single rds_message
 * sized for the payload and any RDMA/atomic control messages, copies the
 * payload, picks or creates the connection and path, parses the control
 * messages, waits for congestion and send-buffer space as permitted by
 * MSG_DONTWAIT and the socket send timeout, queues the message and kicks
 * the transmit path.
 *
 * Returns @payload_len on success or a negative errno.  On failure an MR
 * that was allocated by an RDMA_MAP cmsg during this call is released.
 */
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
	struct sock *sk = sock->sk;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
	__be16 dport;
	struct rds_message *rm = NULL;
	struct rds_connection *conn;
	int ret = 0;
	int queued = 0, allocated_mr = 0;
	int nonblock = msg->msg_flags & MSG_DONTWAIT;
	long timeo = sock_sndtimeo(sk, nonblock);
	struct rds_conn_path *cpath;
	struct in6_addr daddr;
	__u32 scope_id = 0;
	size_t rdma_payload_len = 0;
	bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
		      sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
	int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE);
	int namelen;
	struct rds_iov_vector_arr vct;
	int ind;

	memset(&vct, 0, sizeof(vct));

	/* expect 1 RDMA CMSG per rds_sendmsg. can still grow if more needed. */
	vct.incr = 1;

	/* Mirror Linux UDP mirror of BSD error message compatibility */
	/* XXX: Perhaps MSG_MORE someday */
	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	namelen = msg->msg_namelen;
	if (namelen != 0) {
		if (namelen < sizeof(*usin)) {
			ret = -EINVAL;
			goto out;
		}
		switch (usin->sin_family) {
		case AF_INET:
			if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
			    usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
			    ipv4_is_multicast(usin->sin_addr.s_addr)) {
				ret = -EINVAL;
				goto out;
			}
			/* IPv4 destinations are carried as v4-mapped IPv6 */
			ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
			dport = usin->sin_port;
			break;

#if IS_ENABLED(CONFIG_IPV6)
		case AF_INET6: {
			int addr_type;

			if (namelen < sizeof(*sin6)) {
				ret = -EINVAL;
				goto out;
			}
			addr_type = ipv6_addr_type(&sin6->sin6_addr);
			if (!(addr_type & IPV6_ADDR_UNICAST)) {
				__be32 addr4;

				if (!(addr_type & IPV6_ADDR_MAPPED)) {
					ret = -EINVAL;
					goto out;
				}

				/* It is a mapped address.  Need to do some
				 * sanity checks.
				 */
				addr4 = sin6->sin6_addr.s6_addr32[3];
				if (addr4 == htonl(INADDR_ANY) ||
				    addr4 == htonl(INADDR_BROADCAST) ||
				    ipv4_is_multicast(addr4)) {
					ret = -EINVAL;
					goto out;
				}
			}
			/* a link-local destination needs an explicit scope */
			if (addr_type & IPV6_ADDR_LINKLOCAL) {
				if (sin6->sin6_scope_id == 0) {
					ret = -EINVAL;
					goto out;
				}
				scope_id = sin6->sin6_scope_id;
			}

			daddr = sin6->sin6_addr;
			dport = sin6->sin6_port;
			break;
		}
#endif

		default:
			ret = -EINVAL;
			goto out;
		}
	} else {
		/* We only care about consistency with ->connect() */
		lock_sock(sk);
		daddr = rs->rs_conn_addr;
		dport = rs->rs_conn_port;
		scope_id = rs->rs_bound_scope_id;
		release_sock(sk);
	}

	lock_sock(sk);
	if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
		release_sock(sk);
		ret = -ENOTCONN;
		goto out;
	} else if (namelen != 0) {
		/* Cannot send to an IPv4 address using an IPv6 source
		 * address and cannot send to an IPv6 address using an
		 * IPv4 source address.
		 */
		if (ipv6_addr_v4mapped(&daddr) ^
		    ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
			release_sock(sk);
			ret = -EOPNOTSUPP;
			goto out;
		}
		/* If the socket is already bound to a link local address,
		 * it can only send to peers on the same link.  But allow
		 * communicating between link local and non-link local address.
		 */
		if (scope_id != rs->rs_bound_scope_id) {
			if (!scope_id) {
				scope_id = rs->rs_bound_scope_id;
			} else if (rs->rs_bound_scope_id) {
				release_sock(sk);
				ret = -EINVAL;
				goto out;
			}
		}
	}
	release_sock(sk);

	ret = rds_rdma_bytes(msg, &rdma_payload_len);
	if (ret)
		goto out;

	if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) {
		ret = -EMSGSIZE;
		goto out;
	}

	if (payload_len > rds_sk_sndbuf(rs)) {
		ret = -EMSGSIZE;
		goto out;
	}

	if (zcopy) {
		/* zerocopy sends are only accepted on the TCP transport */
		if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
			ret = -EOPNOTSUPP;
			goto out;
		}
		num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
	}
	/* size of rm including all sgs */
	ret = rds_rm_size(msg, num_sgs, &vct);
	if (ret < 0)
		goto out;

	rm = rds_message_alloc(ret, GFP_KERNEL);
	if (!rm) {
		ret = -ENOMEM;
		goto out;
	}

	/* Attach data to the rm */
	if (payload_len) {
		rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
		if (IS_ERR(rm->data.op_sg)) {
			ret = PTR_ERR(rm->data.op_sg);
			goto out;
		}
		ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
		if (ret)
			goto out;
	}
	rm->data.op_active = 1;

	rm->m_daddr = daddr;

	/* rds_conn_create has a spinlock that runs with IRQ off.
	 * Caching the conn in the socket helps a lot.
	 */
	if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) &&
	    rs->rs_tos == rs->rs_conn->c_tos) {
		conn = rs->rs_conn;
	} else {
		conn = rds_conn_create_outgoing(sock_net(sock->sk),
						&rs->rs_bound_addr, &daddr,
						rs->rs_transport, rs->rs_tos,
						sock->sk->sk_allocation,
						scope_id);
		if (IS_ERR(conn)) {
			ret = PTR_ERR(conn);
			goto out;
		}
		rs->rs_conn = conn;
	}

	if (conn->c_trans->t_mp_capable)
		cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
	else
		cpath = &conn->c_path[0];

	rm->m_conn_path = cpath;

	/* Parse any control messages the user may have included. */
	ret = rds_cmsg_send(rs, rm, msg, &allocated_mr, &vct);
	if (ret)
		goto out;

	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
		printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
			       &rm->rdma, conn->c_trans->xmit_rdma);
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
		printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
			       &rm->atomic, conn->c_trans->xmit_atomic);
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (rds_destroy_pending(conn)) {
		ret = -EAGAIN;
		goto out;
	}

	if (rds_conn_path_down(cpath))
		rds_check_all_paths(conn);

	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
	if (ret) {
		rs->rs_seen_congestion = 1;
		goto out;
	}
	/* Retry queueing until it succeeds, the timeout expires or a signal
	 * arrives.  'queued' makes rds_send_queue_rm() a no-op once it has
	 * succeeded.
	 */
	while (!rds_send_queue_rm(rs, conn, cpath, rm, rs->rs_bound_port,
				  dport, &queued)) {
		rds_stats_inc(s_send_queue_full);

		if (nonblock) {
			ret = -EAGAIN;
			goto out;
		}

		timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
					rds_send_queue_rm(rs, conn, cpath, rm,
							  rs->rs_bound_port,
							  dport,
							  &queued),
					timeo);
		rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
		if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
			continue;

		/* timeo == 0: timed out; timeo < 0: error from the wait */
		ret = timeo;
		if (ret == 0)
			ret = -ETIMEDOUT;
		goto out;
	}

	/*
	 * By now we've committed to the send.  We reuse rds_send_worker()
	 * to retry sends in the rds thread if the transport asks us to.
	 */
	rds_stats_inc(s_send_queued);

	ret = rds_send_xmit(cpath);
	if (ret == -ENOMEM || ret == -EAGAIN) {
		/* The message stays queued; hand the retry to the send
		 * worker unless the connection is being destroyed.
		 */
		ret = 0;
		rcu_read_lock();
		if (rds_destroy_pending(cpath->cp_conn))
			ret = -ENETUNREACH;
		else
			queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
		rcu_read_unlock();
	}
	if (ret)
		goto out;
	rds_message_put(rm);

	for (ind = 0; ind < vct.indx; ind++)
		kfree(vct.vec[ind].iov);
	kfree(vct.vec);

	return payload_len;

out:
	for (ind = 0; ind < vct.indx; ind++)
		kfree(vct.vec[ind].iov);
	kfree(vct.vec);

	/* If the user included a RDMA_MAP cmsg, we allocated a MR on the fly.
	 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
	 * or in any other way, we need to destroy the MR again */
	if (allocated_mr)
		rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);

	if (rm)
		rds_message_put(rm);
	return ret;
}

/*
 * send out a probe. Can be shared by rds_send_ping,
 * rds_send_pong, rds_send_hb.
 * rds_send_hb should use h_flags
 *   RDS_FLAG_HB_PING|RDS_FLAG_ACK_REQUIRED
 * or
 *   RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED
 *
 * A zero-length message is allocated with GFP_ATOMIC, queued on @cp's send
 * queue and the path's send work is scheduled, unless the connection is
 * being destroyed.
 *
 * Returns 0 on success, -ENOMEM if the message cannot be allocated, or the
 * error returned by rds_cong_wait().
 */
static int
rds_send_probe(struct rds_conn_path *cp, __be16 sport,
	       __be16 dport, u8 h_flags)
{
	struct rds_message *rm;
	unsigned long flags;
	int ret = 0;

	rm = rds_message_alloc(0, GFP_ATOMIC);
	if (!rm) {
		ret = -ENOMEM;
		goto out;
	}

	rm->m_daddr = cp->cp_conn->c_faddr;
	rm->data.op_active = 1;

	rds_conn_path_connect_if_down(cp);

	ret = rds_cong_wait(cp->cp_conn->c_fcong, dport, 1, NULL);
	if (ret)
		goto out;

	spin_lock_irqsave(&cp->cp_lock, flags);
	list_add_tail(&rm->m_conn_item, &cp->cp_send_queue);
	set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
	/* reference held for the send queue */
	rds_message_addref(rm);
	rm->m_inc.i_conn = cp->cp_conn;
	rm->m_inc.i_conn_path = cp;

	rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport,
				    cp->cp_next_tx_seq);
	rm->m_inc.i_hdr.h_flags |= h_flags;
	cp->cp_next_tx_seq++;

	/* Handshake probes on a multipath-capable transport also carry the
	 * path count and generation number as extension headers.
	 */
	if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
	    cp->cp_conn->c_trans->t_mp_capable) {
		u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
		u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);

		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_NPATHS, &npaths,
					  sizeof(npaths));
		rds_message_add_extension(&rm->m_inc.i_hdr,
					  RDS_EXTHDR_GEN_NUM,
					  &my_gen_num,
					  sizeof(u32));
	}
	spin_unlock_irqrestore(&cp->cp_lock, flags);

	rds_stats_inc(s_send_queued);
	/* NOTE(review): s_send_pong is bumped for every probe, including
	 * those sent via rds_send_ping() - confirm this is intended.
	 */
	rds_stats_inc(s_send_pong);

	/* schedule the send work on rds_wq */
	rcu_read_lock();
	if (!rds_destroy_pending(cp->cp_conn))
		queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
	rcu_read_unlock();

	/* drop the allocation reference; the queue keeps its own */
	rds_message_put(rm);
	return 0;

out:
	if (rm)
		rds_message_put(rm);
	return ret;
}

/*
 * Send a probe from source port 0 to @dport on path @cp, with no extra
 * header flags.  Returns the result of rds_send_probe().
 */
int
rds_send_pong(struct rds_conn_path *cp, __be16 dport)
{
	return rds_send_probe(cp, 0, dport, 0);
}

/*
 * Send the handshake ping for @conn on path @cp_index, at most once per
 * connection: c_ping_triggered is tested and set under that path's
 * cp_lock, and later calls return without sending.  The result of
 * rds_send_probe() is not reported to the caller.
 */
void
rds_send_ping(struct rds_connection *conn, int cp_index)
{
	unsigned long flags;
	struct rds_conn_path *cp = &conn->c_path[cp_index];

	spin_lock_irqsave(&cp->cp_lock, flags);
	if (conn->c_ping_triggered) {
		spin_unlock_irqrestore(&cp->cp_lock, flags);
		return;
	}
	conn->c_ping_triggered = 1;
	spin_unlock_irqrestore(&cp->cp_lock, flags);
	rds_send_probe(cp, cpu_to_be16(RDS_FLAG_PROBE_PORT), 0, 0);
}
EXPORT_SYMBOL_GPL(rds_send_ping);
4 4 4 4 4 2 2 2 2 58 1 59 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Authors: Sjur Brendeland * Daniel Martensson */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/ip.h> #include <linux/sched.h> #include <linux/sockios.h> #include <linux/caif/if_caif.h> #include <net/rtnetlink.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> /* GPRS PDP connection has MTU to 1500 */ #define GPRS_PDP_MTU 1500 /* 5 sec. connect timeout */ #define CONNECT_TIMEOUT (5 * HZ) #define CAIF_NET_DEFAULT_QUEUE_LEN 500 #define UNDEF_CONNID 0xffffffff /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol GPRS network device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("caif"); enum caif_states { CAIF_CONNECTED = 1, CAIF_CONNECTING, CAIF_DISCONNECTED, CAIF_SHUTDOWN }; struct chnl_net { struct cflayer chnl; struct caif_connect_request conn_req; struct list_head list_field; struct net_device *netdev; wait_queue_head_t netmgmt_wq; /* Flow status to remember and control the transmission. */ bool flowenabled; enum caif_states state; }; static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; struct chnl_net *priv; int pktlen; const u8 *ip_version; u8 buf; priv = container_of(layr, struct chnl_net, chnl); skb = (struct sk_buff *) cfpkt_tonative(pkt); /* Get length of CAIF packet. */ pktlen = skb->len; /* Pass some minimum information and * send the packet to the net stack. 
*/ skb->dev = priv->netdev; /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); if (!ip_version) { kfree_skb(skb); return -EINVAL; } switch (*ip_version >> 4) { case 4: skb->protocol = htons(ETH_P_IP); break; case 6: skb->protocol = htons(ETH_P_IPV6); break; default: kfree_skb(skb); priv->netdev->stats.rx_errors++; return -EINVAL; } /* If we change the header in loop mode, the checksum is corrupted. */ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb->ip_summed = CHECKSUM_NONE; netif_rx(skb); /* Update statistics. */ priv->netdev->stats.rx_packets++; priv->netdev->stats.rx_bytes += pktlen; return 0; } static int delete_device(struct chnl_net *dev) { ASSERT_RTNL(); if (dev->netdev) unregister_netdevice(dev->netdev); return 0; } static void close_work(struct work_struct *work) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); if (dev->state == CAIF_SHUTDOWN) dev_close(dev->netdev); } rtnl_unlock(); } static DECLARE_WORK(close_worker, close_work); static void chnl_hold(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_hold(priv->netdev); } static void chnl_put(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_put(priv->netdev); } static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); pr_debug("NET flowctrl func called flow: %s\n", flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" : flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" : flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ? 
"REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND"); switch (flow) { case CAIF_CTRLCMD_FLOW_OFF_IND: priv->flowenabled = false; netif_stop_queue(priv->netdev); break; case CAIF_CTRLCMD_DEINIT_RSP: priv->state = CAIF_DISCONNECTED; break; case CAIF_CTRLCMD_INIT_FAIL_RSP: priv->state = CAIF_DISCONNECTED; wake_up_interruptible(&priv->netmgmt_wq); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: priv->state = CAIF_SHUTDOWN; netif_tx_disable(priv->netdev); schedule_work(&close_worker); break; case CAIF_CTRLCMD_FLOW_ON_IND: priv->flowenabled = true; netif_wake_queue(priv->netdev); break; case CAIF_CTRLCMD_INIT_RSP: caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put); priv->state = CAIF_CONNECTED; priv->flowenabled = true; netif_wake_queue(priv->netdev); wake_up_interruptible(&priv->netmgmt_wq); break; default: break; } } static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; int len; int result = -1; /* Get our private data. */ priv = netdev_priv(dev); if (skb->len > priv->netdev->mtu) { pr_warn("Size of skb exceeded MTU\n"); kfree_skb(skb); dev->stats.tx_errors++; return NETDEV_TX_OK; } if (!priv->flowenabled) { pr_debug("dropping packets flow off\n"); kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* Store original SKB length. */ len = skb->len; pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb); /* Send the packet down the stack. */ result = priv->chnl.dn->transmit(priv->chnl.dn, pkt); if (result) { dev->stats.tx_dropped++; return NETDEV_TX_OK; } /* Update statistics. 
*/ dev->stats.tx_packets++; dev->stats.tx_bytes += len; return NETDEV_TX_OK; } static int chnl_net_open(struct net_device *dev) { struct chnl_net *priv = NULL; int result = -1; int llifindex, headroom, tailroom, mtu; struct net_device *lldev; ASSERT_RTNL(); priv = netdev_priv(dev); if (!priv) { pr_debug("chnl_net_open: no priv\n"); return -ENODEV; } if (priv->state != CAIF_CONNECTING) { priv->state = CAIF_CONNECTING; result = caif_connect_client(dev_net(dev), &priv->conn_req, &priv->chnl, &llifindex, &headroom, &tailroom); if (result != 0) { pr_debug("err: " "Unable to register and open device," " Err:%d\n", result); goto error; } lldev = __dev_get_by_index(dev_net(dev), llifindex); if (lldev == NULL) { pr_debug("no interface?\n"); result = -ENODEV; goto error; } dev->needed_tailroom = tailroom + lldev->needed_tailroom; dev->hard_header_len = headroom + lldev->hard_header_len + lldev->needed_tailroom; /* * MTU, head-room etc is not know before we have a * CAIF link layer device available. MTU calculation may * override initial RTNL configuration. * MTU is minimum of current mtu, link layer mtu pluss * CAIF head and tail, and PDP GPRS contexts max MTU. 
*/ mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom)); mtu = min_t(int, GPRS_PDP_MTU, mtu); dev_set_mtu(dev, mtu); if (mtu < 100) { pr_warn("CAIF Interface MTU too small (%d)\n", mtu); result = -ENODEV; goto error; } } rtnl_unlock(); /* Release RTNL lock during connect wait */ result = wait_event_interruptible_timeout(priv->netmgmt_wq, priv->state != CAIF_CONNECTING, CONNECT_TIMEOUT); rtnl_lock(); if (result == -ERESTARTSYS) { pr_debug("wait_event_interruptible woken by a signal\n"); result = -ERESTARTSYS; goto error; } if (result == 0) { pr_debug("connect timeout\n"); result = -ETIMEDOUT; goto error; } if (priv->state != CAIF_CONNECTED) { pr_debug("connect failed\n"); result = -ECONNREFUSED; goto error; } pr_debug("CAIF Netdevice connected\n"); return 0; error: caif_disconnect_client(dev_net(dev), &priv->chnl); priv->state = CAIF_DISCONNECTED; pr_debug("state disconnected\n"); return result; } static int chnl_net_stop(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); priv->state = CAIF_DISCONNECTED; caif_disconnect_client(dev_net(dev), &priv->chnl); return 0; } static int chnl_net_init(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); INIT_LIST_HEAD(&priv->list_field); return 0; } static void chnl_net_uninit(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { .ndo_open = chnl_net_open, .ndo_stop = chnl_net_stop, .ndo_init = chnl_net_init, .ndo_uninit = chnl_net_uninit, .ndo_start_xmit = chnl_net_start_xmit, }; static void chnl_net_destructor(struct net_device *dev) { struct chnl_net *priv = netdev_priv(dev); caif_free_client(&priv->chnl); } static void ipcaif_net_setup(struct net_device *dev) { struct chnl_net *priv; dev->netdev_ops = &netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = chnl_net_destructor; dev->flags |= IFF_NOARP; 
dev->flags |= IFF_POINTOPOINT; dev->mtu = GPRS_PDP_MTU; dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN; priv = netdev_priv(dev); priv->chnl.receive = chnl_recv_cb; priv->chnl.ctrlcmd = chnl_flowctrl_cb; priv->netdev = dev; priv->conn_req.protocol = CAIFPROTO_DATAGRAM; priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW; priv->conn_req.priority = CAIF_PRIO_LOW; /* Insert illegal value */ priv->conn_req.sockaddr.u.dgm.connection_id = UNDEF_CONNID; priv->flowenabled = false; init_waitqueue_head(&priv->netmgmt_wq); } static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct chnl_net *priv; u8 loop; priv = netdev_priv(dev); if (nla_put_u32(skb, IFLA_CAIF_IPV4_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id) || nla_put_u32(skb, IFLA_CAIF_IPV6_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id)) goto nla_put_failure; loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP; if (nla_put_u8(skb, IFLA_CAIF_LOOPBACK, loop)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static void caif_netlink_parms(struct nlattr *data[], struct caif_connect_request *conn_req) { if (!data) { pr_warn("no params data found\n"); return; } if (data[IFLA_CAIF_IPV4_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]); if (data[IFLA_CAIF_IPV6_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]); if (data[IFLA_CAIF_LOOPBACK]) { if (nla_get_u8(data[IFLA_CAIF_LOOPBACK])) conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP; else conn_req->protocol = CAIFPROTO_DATAGRAM; } } static int ipcaif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { int ret; struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); ret = register_netdevice(dev); if (ret) pr_warn("device rtml registration failed\n"); else list_add(&caifdev->list_field, &chnl_net_list); 
/* Use ifindex as connection id, and use loopback channel default. */ if (caifdev->conn_req.sockaddr.u.dgm.connection_id == UNDEF_CONNID) { caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex; caifdev->conn_req.protocol = CAIFPROTO_DATAGRAM_LOOP; } return ret; } static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); netdev_state_change(dev); return 0; } static size_t ipcaif_get_size(const struct net_device *dev) { return /* IFLA_CAIF_IPV4_CONNID */ nla_total_size(4) + /* IFLA_CAIF_IPV6_CONNID */ nla_total_size(4) + /* IFLA_CAIF_LOOPBACK */ nla_total_size(2) + 0; } static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = { [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 } }; static struct rtnl_link_ops ipcaif_link_ops __read_mostly = { .kind = "caif", .priv_size = sizeof(struct chnl_net), .setup = ipcaif_net_setup, .maxtype = IFLA_CAIF_MAX, .policy = ipcaif_policy, .newlink = ipcaif_newlink, .changelink = ipcaif_changelink, .get_size = ipcaif_get_size, .fill_info = ipcaif_fill_info, }; static int __init chnl_init_module(void) { return rtnl_link_register(&ipcaif_link_ops); } static void __exit chnl_exit_module(void) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_link_unregister(&ipcaif_link_ops); rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); list_del_init(list_node); delete_device(dev); } rtnl_unlock(); } module_init(chnl_init_module); module_exit(chnl_exit_module);
3 1 3 13 13 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/errno.h> #include <linux/list.h> #include <linux/moduleparam.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "netlink.h" char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; /** * batadv_algo_init() - Initialize batman-adv algorithm management data * structures */ void batadv_algo_init(void) { INIT_HLIST_HEAD(&batadv_algo_list); } /** * batadv_algo_get() - Search for algorithm with specific name * @name: algorithm name to find * * Return: Pointer to batadv_algo_ops on success, NULL otherwise */ struct batadv_algo_ops *batadv_algo_get(const char *name) { struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { if (strcmp(bat_algo_ops_tmp->name, name) != 0) continue; bat_algo_ops = bat_algo_ops_tmp; break; } return bat_algo_ops; } /** * batadv_algo_register() - Register 
callbacks for a mesh algorithm * @bat_algo_ops: mesh algorithm callbacks to add * * Return: 0 on success or negative error number in case of failure */ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); return -EEXIST; } /* all algorithms must implement all ops (for now) */ if (!bat_algo_ops->iface.enable || !bat_algo_ops->iface.disable || !bat_algo_ops->iface.update_mac || !bat_algo_ops->iface.primary_set || !bat_algo_ops->neigh.cmp || !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); return 0; } /** * batadv_algo_select() - Select algorithm of soft interface * @bat_priv: the bat priv with all the soft interface information * @name: name of the algorithm to select * * The algorithm callbacks for the soft interface will be set when the algorithm * with the correct name was found. Any previous selected algorithm will not be * deinitialized and the new selected algorithm will also not be initialized. * It is therefore not allowed to call batadv_algo_select outside the creation * function of the soft interface. 
* * Return: 0 on success or negative error number in case of failure */ int batadv_algo_select(struct batadv_priv *bat_priv, const char *name) { struct batadv_algo_ops *bat_algo_ops; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) return -EINVAL; bat_priv->algo_ops = bat_algo_ops; return 0; } static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { struct batadv_algo_ops *bat_algo_ops; char *algo_name = (char *)val; size_t name_len = strlen(algo_name); if (name_len > 0 && algo_name[name_len - 1] == '\n') algo_name[name_len - 1] = '\0'; bat_algo_ops = batadv_algo_get(algo_name); if (!bat_algo_ops) { pr_err("Routing algorithm '%s' is not supported\n", algo_name); return -EINVAL; } return param_set_copystring(algo_name, kp); } static const struct kernel_param_ops batadv_param_ops_ra = { .set = batadv_param_set_ra, .get = param_get_string, }; static struct kparam_string batadv_param_string_ra = { .maxlen = sizeof(batadv_routing_algo), .string = batadv_routing_algo, }; module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); /** * batadv_algo_dump_entry() - fill in information about one supported routing * algorithm * @msg: netlink message to be sent back * @portid: Port to reply to * @seq: Sequence number of message * @bat_algo_ops: Algorithm to be dumped * * Return: Error number, or 0 on success */ static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_algo_ops *bat_algo_ops) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_algo_dump() - fill in information about supported routing * algorithms * @msg: netlink message to be sent back * @cb: Parameters to the netlink request * * Return: 
Length of reply message. */ int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_algo_ops *bat_algo_ops; int skip = cb->args[0]; int i = 0; hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { if (i++ < skip) continue; if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq, bat_algo_ops)) { i--; break; } } cb->args[0] = i; return msg->len; }
8 8 1 8 8 7 7 7 7 6 2 7 7 7 6 6 5 6 6 6 6 6 6 6 6 7 7 6 6 5 6 8 7 5 6 5 6 6 6 6 6 6 6 6 6 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS segment buffer * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone 
Corporation. * * Written by Ryusuke Konishi. * */ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/crc32.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include "page.h" #include "segbuf.h" struct nilfs_write_info { struct the_nilfs *nilfs; struct bio *bio; int start, end; /* The region to be submitted */ int rest_blocks; int max_pages; int nr_vecs; sector_t blocknr; }; static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs); static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf); struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) { struct nilfs_segment_buffer *segbuf; segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); if (unlikely(!segbuf)) return NULL; segbuf->sb_super = sb; INIT_LIST_HEAD(&segbuf->sb_list); INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); INIT_LIST_HEAD(&segbuf->sb_payload_buffers); segbuf->sb_super_root = NULL; init_completion(&segbuf->sb_bio_event); atomic_set(&segbuf->sb_err, 0); segbuf->sb_nbio = 0; return segbuf; } void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) { kmem_cache_free(nilfs_segbuf_cachep, segbuf); } void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, unsigned long offset, struct the_nilfs *nilfs) { segbuf->sb_segnum = segnum; nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, &segbuf->sb_fseg_end); segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; segbuf->sb_rest_blocks = segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; } /** * nilfs_segbuf_map_cont - map a new log behind a given log * @segbuf: new segment buffer * @prev: segment buffer containing a log to be continued */ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev) { segbuf->sb_segnum = prev->sb_segnum; segbuf->sb_fseg_start = prev->sb_fseg_start; segbuf->sb_fseg_end = prev->sb_fseg_end; segbuf->sb_pseg_start = prev->sb_pseg_start + prev->sb_sum.nblocks; 
segbuf->sb_rest_blocks = segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; } void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, __u64 nextnum, struct the_nilfs *nilfs) { segbuf->sb_nextnum = nextnum; segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); } int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) { struct buffer_head *bh; bh = sb_getblk(segbuf->sb_super, segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); } unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, struct buffer_head **bhp) { struct buffer_head *bh; bh = sb_getblk(segbuf->sb_super, segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); if (unlikely(!bh)) return -ENOMEM; nilfs_segbuf_add_payload_buffer(segbuf, bh); *bhp = bh; return 0; } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, time64_t ctime, __u64 cno) { int err; segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; err = nilfs_segbuf_extend_segsum(segbuf); if (unlikely(err)) return err; segbuf->sb_sum.flags = flags; segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; segbuf->sb_sum.ctime = ctime; segbuf->sb_sum.cno = cno; return 0; } /* * Setup segment summary */ void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) { struct nilfs_segment_summary *raw_sum; struct buffer_head *bh_sum; bh_sum = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, b_assoc_buffers); raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); raw_sum->ss_create 
= cpu_to_le64(segbuf->sb_sum.ctime); raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes); raw_sum->ss_pad = 0; raw_sum->ss_cno = cpu_to_le64(segbuf->sb_sum.cno); } /* * CRC calculation routines */ static void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; unsigned long size, bytes = segbuf->sb_sum.sumbytes; u32 crc; bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, b_assoc_buffers); raw_sum = (struct nilfs_segment_summary *)bh->b_data; size = min_t(unsigned long, bytes, bh->b_size); crc = crc32_le(seed, (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), size - (sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum))); list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { bytes -= size; size = min_t(unsigned long, bytes, bh->b_size); crc = crc32_le(crc, bh->b_data, size); } raw_sum->ss_sumsum = cpu_to_le32(crc); } static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct buffer_head *bh; struct nilfs_segment_summary *raw_sum; u32 crc; bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, b_assoc_buffers); raw_sum = (struct nilfs_segment_summary *)bh->b_data; crc = crc32_le(seed, (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), bh->b_size - sizeof(raw_sum->ss_datasum)); list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { crc = crc32_le(crc, bh->b_data, bh->b_size); } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { size_t offset = offset_in_folio(bh->b_folio, bh->b_data); unsigned char *from; /* Do not support block sizes larger than PAGE_SIZE */ from = kmap_local_folio(bh->b_folio, offset); crc = 
crc32_le(crc, from, bh->b_size); kunmap_local(from); } raw_sum->ss_datasum = cpu_to_le32(crc); } static void nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct nilfs_super_root *raw_sr; struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; unsigned int srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); crc = crc32_le(seed, (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), srsize - sizeof(raw_sr->sr_sum)); raw_sr->sr_sum = cpu_to_le32(crc); } static void nilfs_release_buffers(struct list_head *list) { struct buffer_head *bh, *n; list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); brelse(bh); } } static void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) { nilfs_release_buffers(&segbuf->sb_segsum_buffers); nilfs_release_buffers(&segbuf->sb_payload_buffers); segbuf->sb_super_root = NULL; } /* * Iterators for segment buffers */ void nilfs_clear_logs(struct list_head *logs) { struct nilfs_segment_buffer *segbuf; list_for_each_entry(segbuf, logs, sb_list) nilfs_segbuf_clear(segbuf); } void nilfs_truncate_logs(struct list_head *logs, struct nilfs_segment_buffer *last) { struct nilfs_segment_buffer *n, *segbuf; segbuf = list_prepare_entry(last, logs, sb_list); list_for_each_entry_safe_continue(segbuf, n, logs, sb_list) { list_del_init(&segbuf->sb_list); nilfs_segbuf_clear(segbuf); nilfs_segbuf_free(segbuf); } } int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs) { struct nilfs_segment_buffer *segbuf; int ret = 0; list_for_each_entry(segbuf, logs, sb_list) { ret = nilfs_segbuf_write(segbuf, nilfs); if (ret) break; } return ret; } int nilfs_wait_on_logs(struct list_head *logs) { struct nilfs_segment_buffer *segbuf; int err, ret = 0; list_for_each_entry(segbuf, logs, sb_list) { err = nilfs_segbuf_wait(segbuf); if (err && !ret) ret = err; } return ret; } /** * nilfs_add_checksums_on_logs - 
add checksums on the logs * @logs: list of segment buffers storing target logs * @seed: checksum seed value */ void nilfs_add_checksums_on_logs(struct list_head *logs, u32 seed) { struct nilfs_segment_buffer *segbuf; list_for_each_entry(segbuf, logs, sb_list) { if (segbuf->sb_super_root) nilfs_segbuf_fill_in_super_root_crc(segbuf, seed); nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); nilfs_segbuf_fill_in_data_crc(segbuf, seed); } } /* * BIO operations */ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); complete(&segbuf->sb_bio_event); } static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi) { struct bio *bio = wi->bio; bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; submit_bio(bio); segbuf->sb_nbio++; wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end; return 0; } static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi) { wi->bio = NULL; wi->rest_blocks = segbuf->sb_sum.nblocks; wi->max_pages = BIO_MAX_VECS; wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); wi->start = wi->end = 0; wi->blocknr = segbuf->sb_pseg_start; } static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, struct nilfs_write_info *wi, struct buffer_head *bh) { int err; BUG_ON(wi->nr_vecs <= 0); repeat: if (!wi->bio) { wi->bio = bio_alloc(wi->nilfs->ns_bdev, wi->nr_vecs, REQ_OP_WRITE, GFP_NOIO); wi->bio->bi_iter.bi_sector = (wi->blocknr + wi->end) << (wi->nilfs->ns_blocksize_bits - 9); } if (bio_add_folio(wi->bio, bh->b_folio, bh->b_size, offset_in_folio(bh->b_folio, bh->b_data))) { wi->end++; return 0; } /* bio is FULL */ err = nilfs_segbuf_submit_bio(segbuf, wi); /* never submit current bh */ if (likely(!err)) goto repeat; return err; } /** * nilfs_segbuf_write - submit write 
requests of a log * @segbuf: buffer storing a log to be written * @nilfs: nilfs object * * Return: Always 0. */ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, struct the_nilfs *nilfs) { struct nilfs_write_info wi; struct buffer_head *bh; int res = 0; wi.nilfs = nilfs; nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { res = nilfs_segbuf_submit_bh(segbuf, &wi, bh); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { res = nilfs_segbuf_submit_bh(segbuf, &wi, bh); if (unlikely(res)) goto failed_bio; } if (wi.bio) { /* * Last BIO is always sent through the following * submission. */ wi.bio->bi_opf |= REQ_SYNC; res = nilfs_segbuf_submit_bio(segbuf, &wi); } failed_bio: return res; } /** * nilfs_segbuf_wait - wait for completion of requested BIOs * @segbuf: segment buffer * * Return: 0 on success, or %-EIO if I/O error is detected. */ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) { int err = 0; if (!segbuf->sb_nbio) return 0; do { wait_for_completion(&segbuf->sb_bio_event); } while (--segbuf->sb_nbio > 0); if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { nilfs_err(segbuf->sb_super, "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu", (unsigned long long)segbuf->sb_pseg_start, segbuf->sb_sum.nblocks, (unsigned long long)segbuf->sb_segnum); err = -EIO; } return err; }
19 79 1 78 52 54 19 80 29 10 18 69 19 19 18 19 7 19 53 58 80 19 1 78 81 88 88 9 81 21 5 17 17 16 7 15 18 44 57 23 4 42 3 3 50 67 24 58 29 6 27 5 2 7 2 2 14 61 61 23 20 31 20 31 61 23 51 6 3 32 18 18 14 15 19 19 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 
474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 // SPDX-License-Identifier: GPL-2.0-or-later /* * Symmetric key cipher operations. 
* * Generic encrypt/decrypt wrapper for ciphers, handles operations across * multiple page boundaries by using temporary blocks. In user context, * the kernel is given a chance to schedule us once per page. * * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/bug.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <net/netlink.h> #include "skcipher.h" #define CRYPTO_ALG_TYPE_SKCIPHER_MASK 0x0000000e enum { SKCIPHER_WALK_SLOW = 1 << 0, SKCIPHER_WALK_COPY = 1 << 1, SKCIPHER_WALK_DIFF = 1 << 2, SKCIPHER_WALK_SLEEP = 1 << 3, }; static const struct crypto_type crypto_skcipher_type; static int skcipher_walk_next(struct skcipher_walk *walk); static inline void skcipher_map_src(struct skcipher_walk *walk) { walk->src.virt.addr = scatterwalk_map(&walk->in); } static inline void skcipher_map_dst(struct skcipher_walk *walk) { walk->dst.virt.addr = scatterwalk_map(&walk->out); } static inline void skcipher_unmap_src(struct skcipher_walk *walk) { scatterwalk_unmap(walk->src.virt.addr); } static inline void skcipher_unmap_dst(struct skcipher_walk *walk) { scatterwalk_unmap(walk->dst.virt.addr); } static inline gfp_t skcipher_walk_gfp(struct skcipher_walk *walk) { return walk->flags & SKCIPHER_WALK_SLEEP ? 
GFP_KERNEL : GFP_ATOMIC; } static inline struct skcipher_alg *__crypto_skcipher_alg( struct crypto_alg *alg) { return container_of(alg, struct skcipher_alg, base); } static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize) { u8 *addr = PTR_ALIGN(walk->buffer, walk->alignmask + 1); scatterwalk_copychunks(addr, &walk->out, bsize, 1); return 0; } /** * skcipher_walk_done() - finish one step of a skcipher_walk * @walk: the skcipher_walk * @res: number of bytes *not* processed (>= 0) from walk->nbytes, * or a -errno value to terminate the walk due to an error * * This function cleans up after one step of walking through the source and * destination scatterlists, and advances to the next step if applicable. * walk->nbytes is set to the number of bytes available in the next step, * walk->total is set to the new total number of bytes remaining, and * walk->{src,dst}.virt.addr is set to the next pair of data pointers. If there * is no more data, or if an error occurred (i.e. -errno return), then * walk->nbytes and walk->total are set to 0 and all resources owned by the * skcipher_walk are freed. * * Return: 0 or a -errno value. If @res was a -errno value then it will be * returned, but other errors may occur too. */ int skcipher_walk_done(struct skcipher_walk *walk, int res) { unsigned int n = walk->nbytes; /* num bytes processed this step */ unsigned int total = 0; /* new total remaining */ if (!n) goto finish; if (likely(res >= 0)) { n -= res; /* subtract num bytes *not* processed */ total = walk->total - n; } if (likely(!(walk->flags & (SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF)))) { unmap_src: skcipher_unmap_src(walk); } else if (walk->flags & SKCIPHER_WALK_DIFF) { skcipher_unmap_dst(walk); goto unmap_src; } else if (walk->flags & SKCIPHER_WALK_COPY) { skcipher_map_dst(walk); memcpy(walk->dst.virt.addr, walk->page, n); skcipher_unmap_dst(walk); } else { /* SKCIPHER_WALK_SLOW */ if (res > 0) { /* * Didn't process all bytes. 
Either the algorithm is * broken, or this was the last step and it turned out * the message wasn't evenly divisible into blocks but * the algorithm requires it. */ res = -EINVAL; total = 0; } else n = skcipher_done_slow(walk, n); } if (res > 0) res = 0; walk->total = total; walk->nbytes = 0; scatterwalk_advance(&walk->in, n); scatterwalk_advance(&walk->out, n); scatterwalk_done(&walk->in, 0, total); scatterwalk_done(&walk->out, 1, total); if (total) { if (walk->flags & SKCIPHER_WALK_SLEEP) cond_resched(); walk->flags &= ~(SKCIPHER_WALK_SLOW | SKCIPHER_WALK_COPY | SKCIPHER_WALK_DIFF); return skcipher_walk_next(walk); } finish: /* Short-circuit for the common/fast path. */ if (!((unsigned long)walk->buffer | (unsigned long)walk->page)) goto out; if (walk->iv != walk->oiv) memcpy(walk->oiv, walk->iv, walk->ivsize); if (walk->buffer != walk->page) kfree(walk->buffer); if (walk->page) free_page((unsigned long)walk->page); out: return res; } EXPORT_SYMBOL_GPL(skcipher_walk_done); static int skcipher_next_slow(struct skcipher_walk *walk, unsigned int bsize) { unsigned alignmask = walk->alignmask; unsigned n; u8 *buffer; if (!walk->buffer) walk->buffer = walk->page; buffer = walk->buffer; if (!buffer) { /* Min size for a buffer of bsize bytes aligned to alignmask */ n = bsize + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); buffer = kzalloc(n, skcipher_walk_gfp(walk)); if (!buffer) return skcipher_walk_done(walk, -ENOMEM); walk->buffer = buffer; } walk->dst.virt.addr = PTR_ALIGN(buffer, alignmask + 1); walk->src.virt.addr = walk->dst.virt.addr; scatterwalk_copychunks(walk->src.virt.addr, &walk->in, bsize, 0); walk->nbytes = bsize; walk->flags |= SKCIPHER_WALK_SLOW; return 0; } static int skcipher_next_copy(struct skcipher_walk *walk) { u8 *tmp = walk->page; skcipher_map_src(walk); memcpy(tmp, walk->src.virt.addr, walk->nbytes); skcipher_unmap_src(walk); walk->src.virt.addr = tmp; walk->dst.virt.addr = tmp; return 0; } static int skcipher_next_fast(struct skcipher_walk 
*walk) { unsigned long diff; diff = offset_in_page(walk->in.offset) - offset_in_page(walk->out.offset); diff |= (u8 *)scatterwalk_page(&walk->in) - (u8 *)scatterwalk_page(&walk->out); skcipher_map_src(walk); walk->dst.virt.addr = walk->src.virt.addr; if (diff) { walk->flags |= SKCIPHER_WALK_DIFF; skcipher_map_dst(walk); } return 0; } static int skcipher_walk_next(struct skcipher_walk *walk) { unsigned int bsize; unsigned int n; n = walk->total; bsize = min(walk->stride, max(n, walk->blocksize)); n = scatterwalk_clamp(&walk->in, n); n = scatterwalk_clamp(&walk->out, n); if (unlikely(n < bsize)) { if (unlikely(walk->total < walk->blocksize)) return skcipher_walk_done(walk, -EINVAL); slow_path: return skcipher_next_slow(walk, bsize); } walk->nbytes = n; if (unlikely((walk->in.offset | walk->out.offset) & walk->alignmask)) { if (!walk->page) { gfp_t gfp = skcipher_walk_gfp(walk); walk->page = (void *)__get_free_page(gfp); if (!walk->page) goto slow_path; } walk->flags |= SKCIPHER_WALK_COPY; return skcipher_next_copy(walk); } return skcipher_next_fast(walk); } static int skcipher_copy_iv(struct skcipher_walk *walk) { unsigned alignmask = walk->alignmask; unsigned ivsize = walk->ivsize; unsigned aligned_stride = ALIGN(walk->stride, alignmask + 1); unsigned size; u8 *iv; /* Min size for a buffer of stride + ivsize, aligned to alignmask */ size = aligned_stride + ivsize + (alignmask & ~(crypto_tfm_ctx_alignment() - 1)); walk->buffer = kmalloc(size, skcipher_walk_gfp(walk)); if (!walk->buffer) return -ENOMEM; iv = PTR_ALIGN(walk->buffer, alignmask + 1) + aligned_stride; walk->iv = memcpy(iv, walk->iv, walk->ivsize); return 0; } static int skcipher_walk_first(struct skcipher_walk *walk) { if (WARN_ON_ONCE(in_hardirq())) return -EDEADLK; walk->buffer = NULL; if (unlikely(((unsigned long)walk->iv & walk->alignmask))) { int err = skcipher_copy_iv(walk); if (err) return err; } walk->page = NULL; return skcipher_walk_next(walk); } int skcipher_walk_virt(struct skcipher_walk 
*walk, struct skcipher_request *req, bool atomic) { const struct skcipher_alg *alg = crypto_skcipher_alg(crypto_skcipher_reqtfm(req)); might_sleep_if(req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP); walk->total = req->cryptlen; walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic) walk->flags = SKCIPHER_WALK_SLEEP; else walk->flags = 0; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); /* * Accessing 'alg' directly generates better code than using the * crypto_skcipher_blocksize() and similar helper functions here, as it * prevents the algorithm pointer from being repeatedly reloaded. */ walk->blocksize = alg->base.cra_blocksize; walk->ivsize = alg->co.ivsize; walk->alignmask = alg->base.cra_alignmask; if (alg->co.base.cra_type != &crypto_skcipher_type) walk->stride = alg->co.chunksize; else walk->stride = alg->walksize; return skcipher_walk_first(walk); } EXPORT_SYMBOL_GPL(skcipher_walk_virt); static int skcipher_walk_aead_common(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { const struct aead_alg *alg = crypto_aead_alg(crypto_aead_reqtfm(req)); walk->nbytes = 0; walk->iv = req->iv; walk->oiv = req->iv; if ((req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) && !atomic) walk->flags = SKCIPHER_WALK_SLEEP; else walk->flags = 0; if (unlikely(!walk->total)) return 0; scatterwalk_start(&walk->in, req->src); scatterwalk_start(&walk->out, req->dst); scatterwalk_copychunks(NULL, &walk->in, req->assoclen, 2); scatterwalk_copychunks(NULL, &walk->out, req->assoclen, 2); scatterwalk_done(&walk->in, 0, walk->total); scatterwalk_done(&walk->out, 0, walk->total); /* * Accessing 'alg' directly generates better code than using the * crypto_aead_blocksize() and similar helper functions here, as it * prevents the algorithm pointer from being repeatedly reloaded. 
*/ walk->blocksize = alg->base.cra_blocksize; walk->stride = alg->chunksize; walk->ivsize = alg->ivsize; walk->alignmask = alg->base.cra_alignmask; return skcipher_walk_first(walk); } int skcipher_walk_aead_encrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { walk->total = req->cryptlen; return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_encrypt); int skcipher_walk_aead_decrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); walk->total = req->cryptlen - crypto_aead_authsize(tfm); return skcipher_walk_aead_common(walk, req, atomic); } EXPORT_SYMBOL_GPL(skcipher_walk_aead_decrypt); static void skcipher_set_needkey(struct crypto_skcipher *tfm) { if (crypto_skcipher_max_keysize(tfm) != 0) crypto_skcipher_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } static int skcipher_setkey_unaligned(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { unsigned long alignmask = crypto_skcipher_alignmask(tfm); struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); u8 *buffer, *alignbuffer; unsigned long absize; int ret; absize = keylen + alignmask; buffer = kmalloc(absize, GFP_ATOMIC); if (!buffer) return -ENOMEM; alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); memcpy(alignbuffer, key, keylen); ret = cipher->setkey(tfm, alignbuffer, keylen); kfree_sensitive(buffer); return ret; } int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct skcipher_alg *cipher = crypto_skcipher_alg(tfm); unsigned long alignmask = crypto_skcipher_alignmask(tfm); int err; if (cipher->co.base.cra_type != &crypto_skcipher_type) { struct crypto_lskcipher **ctx = crypto_skcipher_ctx(tfm); crypto_lskcipher_clear_flags(*ctx, CRYPTO_TFM_REQ_MASK); crypto_lskcipher_set_flags(*ctx, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_lskcipher_setkey(*ctx, key, keylen); goto out; } if (keylen < 
cipher->min_keysize || keylen > cipher->max_keysize) return -EINVAL; if ((unsigned long)key & alignmask) err = skcipher_setkey_unaligned(tfm, key, keylen); else err = cipher->setkey(tfm, key, keylen); out: if (unlikely(err)) { skcipher_set_needkey(tfm); return err; } crypto_skcipher_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_skcipher_setkey); int crypto_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_encrypt_sg(req); return alg->encrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt); int crypto_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_decrypt_sg(req); return alg->decrypt(req); } EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt); static int crypto_lskcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(out, ivs + crypto_skcipher_ivsize(tfm), crypto_skcipher_statesize(tfm)); return 0; } static int crypto_lskcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); u8 *ivs = skcipher_request_ctx(req); ivs = PTR_ALIGN(ivs, crypto_skcipher_alignmask(tfm) + 1); memcpy(ivs + crypto_skcipher_ivsize(tfm), in, crypto_skcipher_statesize(tfm)); return 0; } static int skcipher_noexport(struct skcipher_request *req, void *out) { return 0; } static int skcipher_noimport(struct skcipher_request *req, 
const void *in) { return 0; } int crypto_skcipher_export(struct skcipher_request *req, void *out) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_export(req, out); return alg->export(req, out); } EXPORT_SYMBOL_GPL(crypto_skcipher_export); int crypto_skcipher_import(struct skcipher_request *req, const void *in) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_alg *alg = crypto_skcipher_alg(tfm); if (alg->co.base.cra_type != &crypto_skcipher_type) return crypto_lskcipher_import(req, in); return alg->import(req, in); } EXPORT_SYMBOL_GPL(crypto_skcipher_import); static void crypto_skcipher_exit_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); alg->exit(skcipher); } static int crypto_skcipher_init_tfm(struct crypto_tfm *tfm) { struct crypto_skcipher *skcipher = __crypto_skcipher_cast(tfm); struct skcipher_alg *alg = crypto_skcipher_alg(skcipher); skcipher_set_needkey(skcipher); if (tfm->__crt_alg->cra_type != &crypto_skcipher_type) { unsigned am = crypto_skcipher_alignmask(skcipher); unsigned reqsize; reqsize = am & ~(crypto_tfm_ctx_alignment() - 1); reqsize += crypto_skcipher_ivsize(skcipher); reqsize += crypto_skcipher_statesize(skcipher); crypto_skcipher_set_reqsize(skcipher, reqsize); return crypto_init_lskcipher_ops_sg(tfm); } if (alg->exit) skcipher->base.exit = crypto_skcipher_exit_tfm; if (alg->init) return alg->init(skcipher); return 0; } static unsigned int crypto_skcipher_extsize(struct crypto_alg *alg) { if (alg->cra_type != &crypto_skcipher_type) return sizeof(struct crypto_lskcipher *); return crypto_alg_extsize(alg); } static void crypto_skcipher_free_instance(struct crypto_instance *inst) { struct skcipher_instance *skcipher = container_of(inst, struct skcipher_instance, s.base); 
skcipher->free(skcipher); } static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); seq_printf(m, "type : skcipher\n"); seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? "yes" : "no"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", skcipher->min_keysize); seq_printf(m, "max keysize : %u\n", skcipher->max_keysize); seq_printf(m, "ivsize : %u\n", skcipher->ivsize); seq_printf(m, "chunksize : %u\n", skcipher->chunksize); seq_printf(m, "walksize : %u\n", skcipher->walksize); seq_printf(m, "statesize : %u\n", skcipher->statesize); } static int __maybe_unused crypto_skcipher_report( struct sk_buff *skb, struct crypto_alg *alg) { struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg); struct crypto_report_blkcipher rblkcipher; memset(&rblkcipher, 0, sizeof(rblkcipher)); strscpy(rblkcipher.type, "skcipher", sizeof(rblkcipher.type)); strscpy(rblkcipher.geniv, "<none>", sizeof(rblkcipher.geniv)); rblkcipher.blocksize = alg->cra_blocksize; rblkcipher.min_keysize = skcipher->min_keysize; rblkcipher.max_keysize = skcipher->max_keysize; rblkcipher.ivsize = skcipher->ivsize; return nla_put(skb, CRYPTOCFGA_REPORT_BLKCIPHER, sizeof(rblkcipher), &rblkcipher); } static const struct crypto_type crypto_skcipher_type = { .extsize = crypto_skcipher_extsize, .init_tfm = crypto_skcipher_init_tfm, .free = crypto_skcipher_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_skcipher_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_skcipher_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK, .type = CRYPTO_ALG_TYPE_SKCIPHER, .tfmsize = offsetof(struct crypto_skcipher, base), }; int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 
mask) { spawn->base.frontend = &crypto_skcipher_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_skcipher); struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_skcipher); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher( const char *alg_name, u32 type, u32 mask) { struct crypto_skcipher *tfm; /* Only sync algorithms allowed. */ mask |= CRYPTO_ALG_ASYNC | CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE; tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask); /* * Make sure we do not allocate something that might get used with * an on-stack request: check the request size. */ if (!IS_ERR(tfm) && WARN_ON(crypto_skcipher_reqsize(tfm) > MAX_SYNC_SKCIPHER_REQSIZE)) { crypto_free_skcipher(tfm); return ERR_PTR(-EINVAL); } return (struct crypto_sync_skcipher *)tfm; } EXPORT_SYMBOL_GPL(crypto_alloc_sync_skcipher); int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_skcipher_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_skcipher); int skcipher_prepare_alg_common(struct skcipher_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 || alg->statesize > PAGE_SIZE / 2 || (alg->ivsize + alg->statesize) > PAGE_SIZE / 2) return -EINVAL; if (!alg->chunksize) alg->chunksize = base->cra_blocksize; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int skcipher_prepare_alg(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg_common(&alg->co); if (err) return err; if (alg->walksize > PAGE_SIZE / 8) return -EINVAL; if (!alg->walksize) alg->walksize = alg->chunksize; if (!alg->statesize) { alg->import = skcipher_noimport; alg->export = skcipher_noexport; } else if (!(alg->import && alg->export)) return 
-EINVAL; base->cra_type = &crypto_skcipher_type; base->cra_flags |= CRYPTO_ALG_TYPE_SKCIPHER; return 0; } int crypto_register_skcipher(struct skcipher_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = skcipher_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_skcipher); void crypto_unregister_skcipher(struct skcipher_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_skcipher); int crypto_register_skciphers(struct skcipher_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_skcipher(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_skciphers); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_skcipher(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_skciphers); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = skcipher_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, skcipher_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(skcipher_register_instance); static int skcipher_setkey_simple(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); crypto_cipher_clear_flags(cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(cipher, key, keylen); } static int skcipher_init_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); struct crypto_cipher *cipher; cipher = 
crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->cipher = cipher; return 0; } static void skcipher_exit_tfm_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); crypto_free_cipher(ctx->cipher); } static void skcipher_free_instance_simple(struct skcipher_instance *inst) { crypto_drop_cipher(skcipher_instance_ctx(inst)); kfree(inst); } /** * skcipher_alloc_instance_simple - allocate instance of simple block cipher mode * * Allocate an skcipher_instance for a simple block cipher mode of operation, * e.g. cbc or ecb. The instance context will have just a single crypto_spawn, * that for the underlying cipher. The {min,max}_keysize, ivsize, blocksize, * alignmask, and priority are set from the underlying cipher but can be * overridden if needed. The tfm context defaults to skcipher_ctx_simple, and * default ->setkey(), ->init(), and ->exit() methods are installed. * * @tmpl: the template being instantiated * @tb: the template parameters * * Return: a pointer to the new instance, or an ERR_PTR(). The caller still * needs to register the instance. 
*/ struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct skcipher_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *cipher_alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return ERR_PTR(err); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return ERR_PTR(-ENOMEM); spawn = skcipher_instance_ctx(inst); err = crypto_grab_cipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; cipher_alg = crypto_spawn_cipher_alg(spawn); err = crypto_inst_setname(skcipher_crypto_instance(inst), tmpl->name, cipher_alg); if (err) goto err_free_inst; inst->free = skcipher_free_instance_simple; /* Default algorithm properties, can be overridden */ inst->alg.base.cra_blocksize = cipher_alg->cra_blocksize; inst->alg.base.cra_alignmask = cipher_alg->cra_alignmask; inst->alg.base.cra_priority = cipher_alg->cra_priority; inst->alg.min_keysize = cipher_alg->cra_cipher.cia_min_keysize; inst->alg.max_keysize = cipher_alg->cra_cipher.cia_max_keysize; inst->alg.ivsize = cipher_alg->cra_blocksize; /* Use skcipher_ctx_simple by default, can be overridden */ inst->alg.base.cra_ctxsize = sizeof(struct skcipher_ctx_simple); inst->alg.setkey = skcipher_setkey_simple; inst->alg.init = skcipher_init_tfm_simple; inst->alg.exit = skcipher_exit_tfm_simple; return inst; err_free_inst: skcipher_free_instance_simple(inst); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(skcipher_alloc_instance_simple); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Symmetric key cipher type"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
15 15 2 13 15 1 1 15 9 9 7 7 7 7 5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct linkinfo_req_info { struct ethnl_req_info base; }; struct linkinfo_reply_data { struct ethnl_reply_data base; struct ethtool_link_ksettings ksettings; struct ethtool_link_settings *lsettings; }; #define LINKINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct linkinfo_reply_data, base) const struct nla_policy ethnl_linkinfo_get_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; data->lsettings = &data->ksettings.base; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); if (ret < 0) GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); ethnl_ops_complete(dev); return ret; } static int linkinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + 0; } static int linkinfo_fill_reply(struct sk_buff *skb, const 
struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, data->lsettings->phy_address) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, data->lsettings->eth_tp_mdix) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, data->lsettings->eth_tp_mdix_ctrl) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, data->lsettings->transceiver)) return -EMSGSIZE; return 0; } /* LINKINFO_SET */ const struct nla_policy ethnl_linkinfo_set_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, }; static int ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!ops->get_link_ksettings || !ops->set_link_ksettings) return -EOPNOTSUPP; return 1; } static int ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct ethtool_link_settings *lsettings; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); return ret; } lsettings = &ksettings.base; ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], &mod); ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); return ret; } return 1; } const struct 
ethnl_request_ops ethnl_linkinfo_request_ops = { .request_cmd = ETHTOOL_MSG_LINKINFO_GET, .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, .req_info_size = sizeof(struct linkinfo_req_info), .reply_data_size = sizeof(struct linkinfo_reply_data), .prepare_data = linkinfo_prepare_data, .reply_size = linkinfo_reply_size, .fill_reply = linkinfo_fill_reply, .set_validate = ethnl_set_linkinfo_validate, .set = ethnl_set_linkinfo, .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF, };
25 25 7 11 11 11 11 25 25 25 31 31 16 9 31 31 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include "percpu_freelist.h" int pcpu_freelist_init(struct pcpu_freelist *s) { int cpu; s->freelist = alloc_percpu(struct pcpu_freelist_head); if (!s->freelist) return -ENOMEM; for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); raw_spin_lock_init(&head->lock); head->first = NULL; } raw_spin_lock_init(&s->extralist.lock); s->extralist.first = NULL; return 0; } void pcpu_freelist_destroy(struct pcpu_freelist *s) { free_percpu(s->freelist); } static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { node->next = head->first; WRITE_ONCE(head->first, node); } static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); } static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { if (!raw_spin_trylock(&s->extralist.lock)) return false; pcpu_freelist_push_node(&s->extralist, node); raw_spin_unlock(&s->extralist.lock); return true; } static inline void ___pcpu_freelist_push_nmi(struct 
pcpu_freelist *s, struct pcpu_freelist_node *node) { int cpu, orig_cpu; orig_cpu = raw_smp_processor_id(); while (1) { for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { struct pcpu_freelist_head *head; head = per_cpu_ptr(s->freelist, cpu); if (raw_spin_trylock(&head->lock)) { pcpu_freelist_push_node(head, node); raw_spin_unlock(&head->lock); return; } } /* cannot lock any per cpu lock, try extralist */ if (pcpu_freelist_try_push_extra(s, node)) return; } } void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { if (in_nmi()) ___pcpu_freelist_push_nmi(s, node); else ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { unsigned long flags; local_irq_save(flags); __pcpu_freelist_push(s, node); local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; unsigned int cpu, cpu_idx, i, j, n, m; n = nr_elems / num_possible_cpus(); m = nr_elems % num_possible_cpus(); cpu_idx = 0; for_each_possible_cpu(cpu) { head = per_cpu_ptr(s->freelist, cpu); j = n + (cpu_idx < m ? 1 : 0); for (i = 0; i < j; i++) { /* No locking required as this is not visible yet. 
*/ pcpu_freelist_push_node(head, buf); buf += elem_size; } cpu_idx++; } } static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; raw_spin_lock(&head->lock); node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } /* per cpu lists are all empty, try extralist */ if (!READ_ONCE(s->extralist.first)) return NULL; raw_spin_lock(&s->extralist.lock); node = s->extralist.first; if (node) WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } static struct pcpu_freelist_node * ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; if (raw_spin_trylock(&head->lock)) { node = head->first; if (node) { WRITE_ONCE(head->first, node->next); raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); } } /* cannot pop from per cpu lists, try extralist */ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) return NULL; node = s->extralist.first; if (node) WRITE_ONCE(s->extralist.first, node->next); raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { if (in_nmi()) return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_node *ret; unsigned long flags; local_irq_save(flags); ret = __pcpu_freelist_pop(s); local_irq_restore(flags); return ret; }
35 1603 706 3837 3946 3953 8 87 5938 5933 5932 5826 5830 5822 103 1421 34 34 3535 1839 1709 2367 1186 2647 923 2990 692 2558 2555 2558 1318 1321 1321 49 49 2801 2811 2804 2751 55 30 1365 2127 2578 375 2797 2804 2797 10 2817 1980 45110 45649 1600 1975 1418 1421 1407 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/swap.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables... 
*/ static struct files_stat_struct files_stat = { .max_files = NR_FILE }; /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; static struct kmem_cache *bfilp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; union { struct path user_path; freeptr_t bf_freeptr; }; }; static inline struct backing_file *backing_file(struct file *f) { return container_of(f, struct backing_file, file); } struct path *backing_file_user_path(struct file *f) { return &backing_file(f)->user_path; } EXPORT_SYMBOL_GPL(backing_file_user_path); static inline void file_free(struct file *f) { security_file_free(f); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); kmem_cache_free(bfilp_cachep, backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } } /* * Return the total number of open files in the system */ static long get_nr_files(void) { return percpu_counter_read_positive(&nr_files); } /* * Return the maximum number of open files in the system */ unsigned long get_max_files(void) { return files_stat.max_files; } EXPORT_SYMBOL_GPL(get_max_files); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Handle nr_files sysctl */ static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_stat_sysctls[] = { { .procname = "file-nr", .data = &files_stat, .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = SYSCTL_LONG_ZERO, 
.extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, }; static int __init init_fs_stat_sysctls(void) { register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } return 0; } fs_initcall(init_fs_stat_sysctls); #endif static int init_file(struct file *f, int flags, const struct cred *cred) { int error; f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { put_cred(f->f_cred); return error; } spin_lock_init(&f->f_lock); /* * Note that f_pos_lock is only used for files raising * FMODE_ATOMIC_POS and directories. Other files such as pipes * don't need it and since f_pos_lock is in a union may reuse * the space for other purposes. They are expected to initialize * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); memset(&f->f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); f->f_op = NULL; f->f_mapping = NULL; f->private_data = NULL; f->f_inode = NULL; f->f_owner = NULL; #ifdef CONFIG_EPOLL f->f_ep = NULL; #endif f->f_iocb_flags = 0; f->f_pos = 0; f->f_wb_err = 0; f->f_sb_err = 0; /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); /* * Disable permission and pre-content events for all files by default. * They may be enabled later by file_set_fsnotify_mode_from_watchers(). */ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. 
we over file * structures limit, run out of memory or operation is not permitted. * * Be very careful using this. You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. */ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; int error; /* * Privileged users can go above max_files */ if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } f->f_mode |= FMODE_NOACCOUNT; return f; } /* * Variant of alloc_empty_file() that allocates a backing_file container * and doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. 
*/ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) { struct backing_file *ff; int error; ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { kmem_cache_free(bfilp_cachep, ff); return ERR_PTR(error); } ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; return &ff->file; } /** * file_init_path - initialize a 'struct file' based on path * * @file: the file to set up * @path: the (dentry, vfsmount) pair for the new file * @fop: the 'struct file_operations' for the new file */ static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if (fop->llseek) file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_... 
flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ static struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; file = alloc_empty_file(flags, current_cred()); if (!IS_ERR(file)) file_init_path(file, path, fop); return file; } static inline int alloc_path_pseudo(const char *name, struct inode *inode, struct vfsmount *mnt, struct path *path) { path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); if (!path->dentry) return -ENOMEM; path->mnt = mntget(mnt); d_instantiate(path->dentry, inode); return 0; } struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { int ret; struct path path; struct file *file; ret = alloc_path_pseudo(name, inode, mnt, &path); if (ret) return ERR_PTR(ret); file = alloc_file(&path, flags, fops); if (IS_ERR(file)) { ihold(inode); path_put(&path); return file; } /* * Disable all fsnotify events for pseudo files by default. * They may be enabled by caller with file_set_fsnotify_mode(). */ file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL(alloc_file_pseudo); struct file *alloc_file_pseudo_noaccount(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, const struct file_operations *fops) { int ret; struct path path; struct file *file; ret = alloc_path_pseudo(name, inode, mnt, &path); if (ret) return ERR_PTR(ret); file = alloc_empty_file_noaccount(flags, current_cred()); if (IS_ERR(file)) { ihold(inode); path_put(&path); return file; } file_init_path(file, &path, fops); /* * Disable all fsnotify events for pseudo files by default. * They may be enabled by caller with file_set_fsnotify_mode(). 
*/ file_set_fsnotify_mode(file, FMODE_NONOTIFY); return file; } EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { struct file *f; f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; } return f; } /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; might_sleep(); fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. */ eventpoll_release(file); locks_remove_file(file); security_file_release(file); if (unlikely(file->f_flags & FASYNC)) { if (file->f_op->fasync) file->f_op->fasync(-1, file, 0); } if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); } static LLIST_HEAD(delayed_fput_list); static void delayed_fput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_fput_list); struct file *f, *t; llist_for_each_entry_safe(f, t, node, f_llist) __fput(f); } static void ____fput(struct callback_head *work) { __fput(container_of(work, struct file, f_task_work)); } static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); /* * If kernel thread really needs to have the final fput() it has done * to complete, call this. 
The only user right now is the boot - we * *do* need to make sure our writes to binaries on initramfs has * not left us with opened struct file waiting for __fput() - execve() * won't work without that. Please, don't add more callers without * very good reasons; in particular, never call that with locks * held and never call that from a thread that might need to do * some work on any kind of umount. */ void flush_delayed_fput(void) { delayed_fput(NULL); flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); void fput(struct file *file) { if (file_ref_put(&file->f_ref)) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { file_free(file); return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_task_work, ____fput); if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. 
*/ void __fput_sync(struct file *file) { if (file_ref_put(&file->f_ref)) __fput(file); } EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct file, f_freeptr), }; filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. Per default * do not use more than 10% of our memory for files. */ void __init files_maxfiles_init(void) { unsigned long n; unsigned long nr_pages = totalram_pages(); unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; memreserve = min(memreserve, nr_pages - 1); n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); }
1 2 3 7 3 8 7 1 14 14 2 1 7 7 1 6 1 5 6 1 1 2 2 5 7 1 1 8 1 1 6 5 1 2 3 4 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "rsrc.h" #include "openclose.h" struct io_open { struct file *file; int dfd; u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; }; struct io_close { struct file *file; int fd; u32 file_slot; }; struct io_fixed_install { struct file *file; unsigned int o_flags; }; static bool io_openat_force_async(struct io_open *open) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, * it'll always -EAGAIN. 
Note that we test for __O_TMPFILE because * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force * async for. */ return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE); } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); const char __user *fname; int ret; if (unlikely(sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; /* open.how should be already initialised */ if (!(open->how.flags & O_PATH) && force_o_largefile()) open->how.flags |= O_LARGEFILE; open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); open->filename = getname(fname); if (IS_ERR(open->filename)) { ret = PTR_ERR(open->filename); open->filename = NULL; return ret; } open->file_slot = READ_ONCE(sqe->file_index); if (open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; if (io_openat_force_async(open)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); u64 mode = READ_ONCE(sqe->len); u64 flags = READ_ONCE(sqe->open_flags); open->how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct open_how __user *how; size_t len; int ret; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); if (ret) return ret; return __io_openat_prep(req, sqe); } int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct open_flags op; struct file *file; bool resolve_nonblock, 
nonblock_set; bool fixed = !!open->file_slot; int ret; ret = build_open_flags(&open->how, &op); if (ret) goto err; nonblock_set = op.open_flag & O_NONBLOCK; resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { WARN_ON_ONCE(io_openat_force_async(open)); op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } if (!fixed) { ret = __get_unused_fd_flags(open->how.flags, open->nofile); if (ret < 0) goto err; } file = do_filp_open(open->dfd, open->filename, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. */ if (!fixed) put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ if (ret == -EAGAIN && (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) return -EAGAIN; goto err; } if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; if (!fixed) fd_install(ret, file); else ret = io_fixed_fd_install(req, issue_flags, file, open->file_slot); err: putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) { return io_openat2(req, issue_flags); } void io_open_cleanup(struct io_kiocb *req) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); if (open->filename) putname(open->filename); } int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset) { int ret; io_ring_submit_lock(ctx, issue_flags); ret = io_fixed_fd_remove(ctx, offset); io_ring_submit_unlock(ctx, issue_flags); return ret; } static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { struct io_close *close = io_kiocb_to_cmd(req, struct io_close); return __io_close_fixed(req->ctx, issue_flags, close->file_slot 
- 1); } int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_close *close = io_kiocb_to_cmd(req, struct io_close); if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; close->fd = READ_ONCE(sqe->fd); close->file_slot = READ_ONCE(sqe->file_index); if (close->file_slot && close->fd) return -EINVAL; return 0; } int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; struct io_close *close = io_kiocb_to_cmd(req, struct io_close); struct file *file; int ret = -EBADF; if (close->file_slot) { ret = io_close_fixed(req, issue_flags); goto err; } spin_lock(&files->file_lock); file = files_lookup_fd_locked(files, close->fd); if (!file || io_is_uring_fops(file)) { spin_unlock(&files->file_lock); goto err; } /* if the file has a flush method, be safe and punt to async */ if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { spin_unlock(&files->file_lock); return -EAGAIN; } file = file_close_fd_locked(files, close->fd); spin_unlock(&files->file_lock); if (!file) goto err; /* No ->flush() or already async, safely close from here */ ret = filp_close(file, current->files); err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fixed_install *ifi; unsigned int flags; if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || sqe->splice_fd_in || sqe->addr3) return -EINVAL; /* must be a fixed file */ if (!(req->flags & REQ_F_FIXED_FILE)) return -EBADF; flags = READ_ONCE(sqe->install_fd_flags); if (flags & ~IORING_FIXED_FD_NO_CLOEXEC) return -EINVAL; /* ensure the task's creds are used when installing/receiving fds */ if (req->flags & REQ_F_CREDS) return -EPERM; /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */ ifi = io_kiocb_to_cmd(req, struct io_fixed_install); 
ifi->o_flags = O_CLOEXEC; if (flags & IORING_FIXED_FD_NO_CLOEXEC) ifi->o_flags = 0; return 0; } int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_fixed_install *ifi; int ret; ifi = io_kiocb_to_cmd(req, struct io_fixed_install); ret = receive_fd(req->file, NULL, ifi->o_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 
1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 
1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 /* * Copyright (c) 2007-2011 Atheros Communications Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "core.h" #include "debug.h" #include "hif-ops.h" #define HTC_PACKET_CONTAINER_ALLOCATION 32 #define HTC_CONTROL_BUFFER_SIZE (HTC_MAX_CTRL_MSG_LEN + HTC_HDR_LENGTH) static int ath6kl_htc_pipe_tx(struct htc_target *handle, struct htc_packet *packet); static void ath6kl_htc_pipe_cleanup(struct htc_target *handle); /* htc pipe tx path */ static inline void restore_tx_packet(struct htc_packet *packet) { if (packet->info.tx.flags & HTC_FLAGS_TX_FIXUP_NETBUF) { skb_pull(packet->skb, sizeof(struct htc_frame_hdr)); packet->info.tx.flags &= ~HTC_FLAGS_TX_FIXUP_NETBUF; } } static void do_send_completion(struct htc_endpoint *ep, struct list_head *queue_to_indicate) { struct htc_packet *packet; if (list_empty(queue_to_indicate)) { /* nothing to indicate */ return; } if (ep->ep_cb.tx_comp_multi != NULL) { ath6kl_dbg(ATH6KL_DBG_HTC, "%s: calling ep %d, send complete multiple callback (%d pkts)\n", __func__, ep->eid, get_queue_depth(queue_to_indicate)); /* * a multiple send complete handler is being used, * pass the queue to the handler */ ep->ep_cb.tx_comp_multi(ep->target, queue_to_indicate); /* * all packets are now owned by the callback, * reset queue to be safe */ INIT_LIST_HEAD(queue_to_indicate); } else { /* using legacy EpTxComplete */ do { packet = list_first_entry(queue_to_indicate, struct htc_packet, list); list_del(&packet->list); ath6kl_dbg(ATH6KL_DBG_HTC, "%s: calling ep %d send complete callback on packet 0x%p\n", __func__, ep->eid, packet); ep->ep_cb.tx_complete(ep->target, packet); } while (!list_empty(queue_to_indicate)); } } static void send_packet_completion(struct htc_target *target, struct htc_packet *packet) { struct 
htc_endpoint *ep = &target->endpoint[packet->endpoint]; struct list_head container; restore_tx_packet(packet); INIT_LIST_HEAD(&container); list_add_tail(&packet->list, &container); /* do completion */ do_send_completion(ep, &container); } static void get_htc_packet_credit_based(struct htc_target *target, struct htc_endpoint *ep, struct list_head *queue) { int credits_required; int remainder; u8 send_flags; struct htc_packet *packet; unsigned int transfer_len; /* NOTE : the TX lock is held when this function is called */ /* loop until we can grab as many packets out of the queue as we can */ while (true) { send_flags = 0; if (list_empty(&ep->txq)) break; /* get packet at head, but don't remove it */ packet = list_first_entry(&ep->txq, struct htc_packet, list); ath6kl_dbg(ATH6KL_DBG_HTC, "%s: got head packet:0x%p , queue depth: %d\n", __func__, packet, get_queue_depth(&ep->txq)); transfer_len = packet->act_len + HTC_HDR_LENGTH; if (transfer_len <= target->tgt_cred_sz) { credits_required = 1; } else { /* figure out how many credits this message requires */ credits_required = transfer_len / target->tgt_cred_sz; remainder = transfer_len % target->tgt_cred_sz; if (remainder) credits_required++; } ath6kl_dbg(ATH6KL_DBG_HTC, "%s: creds required:%d got:%d\n", __func__, credits_required, ep->cred_dist.credits); if (ep->eid == ENDPOINT_0) { /* * endpoint 0 is special, it always has a credit and * does not require credit based flow control */ credits_required = 0; } else { if (ep->cred_dist.credits < credits_required) break; ep->cred_dist.credits -= credits_required; ep->ep_st.cred_cosumd += credits_required; /* check if we need credits back from the target */ if (ep->cred_dist.credits < ep->cred_dist.cred_per_msg) { /* tell the target we need credits ASAP! 
*/ send_flags |= HTC_FLAGS_NEED_CREDIT_UPDATE; ep->ep_st.cred_low_indicate += 1; ath6kl_dbg(ATH6KL_DBG_HTC, "%s: host needs credits\n", __func__); } } /* now we can fully dequeue */ packet = list_first_entry(&ep->txq, struct htc_packet, list); list_del(&packet->list); /* save the number of credits this packet consumed */ packet->info.tx.cred_used = credits_required; /* save send flags */ packet->info.tx.flags = send_flags; packet->info.tx.seqno = ep->seqno; ep->seqno++; /* queue this packet into the caller's queue */ list_add_tail(&packet->list, queue); } } static void get_htc_packet(struct htc_target *target, struct htc_endpoint *ep, struct list_head *queue, int resources) { struct htc_packet *packet; /* NOTE : the TX lock is held when this function is called */ /* loop until we can grab as many packets out of the queue as we can */ while (resources) { if (list_empty(&ep->txq)) break; packet = list_first_entry(&ep->txq, struct htc_packet, list); list_del(&packet->list); ath6kl_dbg(ATH6KL_DBG_HTC, "%s: got packet:0x%p , new queue depth: %d\n", __func__, packet, get_queue_depth(&ep->txq)); packet->info.tx.seqno = ep->seqno; packet->info.tx.flags = 0; packet->info.tx.cred_used = 0; ep->seqno++; /* queue this packet into the caller's queue */ list_add_tail(&packet->list, queue); resources--; } } static int htc_issue_packets(struct htc_target *target, struct htc_endpoint *ep, struct list_head *pkt_queue) { int status = 0; u16 payload_len; struct sk_buff *skb; struct htc_frame_hdr *htc_hdr; struct htc_packet *packet; ath6kl_dbg(ATH6KL_DBG_HTC, "%s: queue: 0x%p, pkts %d\n", __func__, pkt_queue, get_queue_depth(pkt_queue)); while (!list_empty(pkt_queue)) { packet = list_first_entry(pkt_queue, struct htc_packet, list); list_del(&packet->list); skb = packet->skb; if (!skb) { WARN_ON_ONCE(1); status = -EINVAL; break; } payload_len = packet->act_len; /* setup HTC frame header */ htc_hdr = skb_push(skb, sizeof(*htc_hdr)); if (!htc_hdr) { WARN_ON_ONCE(1); status = -EINVAL; 
break; } packet->info.tx.flags |= HTC_FLAGS_TX_FIXUP_NETBUF; put_unaligned_le16(payload_len, &htc_hdr->payld_len); htc_hdr->flags = packet->info.tx.flags; htc_hdr->eid = (u8) packet->endpoint; htc_hdr->ctrl[0] = 0; htc_hdr->ctrl[1] = (u8) packet->info.tx.seqno; spin_lock_bh(&target->tx_lock); /* store in look up queue to match completions */ list_add_tail(&packet->list, &ep->pipe.tx_lookup_queue); ep->ep_st.tx_issued += 1; spin_unlock_bh(&target->tx_lock); status = ath6kl_hif_pipe_send(target->dev->ar, ep->pipe.pipeid_ul, NULL, skb); if (status != 0) { if (status != -ENOMEM) { /* TODO: if more than 1 endpoint maps to the * same PipeID, it is possible to run out of * resources in the HIF layer. * Don't emit the error */ ath6kl_dbg(ATH6KL_DBG_HTC, "%s: failed status:%d\n", __func__, status); } spin_lock_bh(&target->tx_lock); list_del(&packet->list); /* reclaim credits */ ep->cred_dist.credits += packet->info.tx.cred_used; spin_unlock_bh(&target->tx_lock); /* put it back into the callers queue */ list_add(&packet->list, pkt_queue); break; } } if (status != 0) { while (!list_empty(pkt_queue)) { if (status != -ENOMEM) { ath6kl_dbg(ATH6KL_DBG_HTC, "%s: failed pkt:0x%p status:%d\n", __func__, packet, status); } packet = list_first_entry(pkt_queue, struct htc_packet, list); list_del(&packet->list); packet->status = status; send_packet_completion(target, packet); } } return status; } static enum htc_send_queue_result htc_try_send(struct htc_target *target, struct htc_endpoint *ep, struct list_head *txq) { struct list_head send_queue; /* temp queue to hold packets */ struct htc_packet *packet, *tmp_pkt; struct ath6kl *ar = target->dev->ar; enum htc_send_full_action action; int tx_resources, overflow, txqueue_depth, i, good_pkts; u8 pipeid; ath6kl_dbg(ATH6KL_DBG_HTC, "%s: (queue:0x%p depth:%d)\n", __func__, txq, (txq == NULL) ? 
0 : get_queue_depth(txq)); /* init the local send queue */ INIT_LIST_HEAD(&send_queue); /* * txq equals to NULL means * caller didn't provide a queue, just wants us to * check queues and send */ if (txq != NULL) { if (list_empty(txq)) { /* empty queue */ return HTC_SEND_QUEUE_DROP; } spin_lock_bh(&target->tx_lock); txqueue_depth = get_queue_depth(&ep->txq); spin_unlock_bh(&target->tx_lock); if (txqueue_depth >= ep->max_txq_depth) { /* we've already overflowed */ overflow = get_queue_depth(txq); } else { /* get how much we will overflow by */ overflow = txqueue_depth; overflow += get_queue_depth(txq); /* get how much we will overflow the TX queue by */ overflow -= ep->max_txq_depth; } /* if overflow is negative or zero, we are okay */ if (overflow > 0) { ath6kl_dbg(ATH6KL_DBG_HTC, "%s: Endpoint %d, TX queue will overflow :%d, Tx Depth:%d, Max:%d\n", __func__, ep->eid, overflow, txqueue_depth, ep->max_txq_depth); } if ((overflow <= 0) || (ep->ep_cb.tx_full == NULL)) { /* * all packets will fit or caller did not provide send * full indication handler -- just move all of them * to the local send_queue object */ list_splice_tail_init(txq, &send_queue); } else { good_pkts = get_queue_depth(txq) - overflow; if (good_pkts < 0) { WARN_ON_ONCE(1); return HTC_SEND_QUEUE_DROP; } /* we have overflowed, and a callback is provided */ /* dequeue all non-overflow packets to the sendqueue */ for (i = 0; i < good_pkts; i++) { /* pop off caller's queue */ packet = list_first_entry(txq, struct htc_packet, list); /* move to local queue */ list_move_tail(&packet->list, &send_queue); } /* * the caller's queue has all the packets that won't fit * walk through the caller's queue and indicate each to * the send full handler */ list_for_each_entry_safe(packet, tmp_pkt, txq, list) { ath6kl_dbg(ATH6KL_DBG_HTC, "%s: Indicate overflowed TX pkts: %p\n", __func__, packet); action = ep->ep_cb.tx_full(ep->target, packet); if (action == HTC_SEND_FULL_DROP) { /* callback wants the packet dropped */ 
ep->ep_st.tx_dropped += 1; /* leave this one in the caller's queue * for cleanup */ } else { /* callback wants to keep this packet, * move from caller's queue to the send * queue */ list_move_tail(&packet->list, &send_queue); } } if (list_empty(&send_queue)) { /* no packets made it in, caller will cleanup */ return HTC_SEND_QUEUE_DROP; } } } if (!ep->pipe.tx_credit_flow_enabled) { tx_resources = ath6kl_hif_pipe_get_free_queue_number(ar, ep->pipe.pipeid_ul); } else { tx_resources = 0; } spin_lock_bh(&target->tx_lock); if (!list_empty(&send_queue)) { /* transfer packets to tail */ list_splice_tail_init(&send_queue, &ep->txq); if (!list_empty(&send_queue)) { WARN_ON_ONCE(1); spin_unlock_bh(&target->tx_lock); return HTC_SEND_QUEUE_DROP; } INIT_LIST_HEAD(&send_queue); } /* increment tx processing count on entry */ ep->tx_proc_cnt++; if (ep->tx_proc_cnt > 1) { /* * Another thread or task is draining the TX queues on this * endpoint that thread will reset the tx processing count * when the queue is drained. */ ep->tx_proc_cnt--; spin_unlock_bh(&target->tx_lock); return HTC_SEND_QUEUE_OK; } /***** beyond this point only 1 thread may enter ******/ /* * Now drain the endpoint TX queue for transmission as long as we have * enough transmit resources. */ while (true) { if (get_queue_depth(&ep->txq) == 0) break; if (ep->pipe.tx_credit_flow_enabled) { /* * Credit based mechanism provides flow control * based on target transmit resource availability, * we assume that the HIF layer will always have * bus resources greater than target transmit * resources. */ get_htc_packet_credit_based(target, ep, &send_queue); } else { /* * Get all packets for this endpoint that we can * for this pass. */ get_htc_packet(target, ep, &send_queue, tx_resources); } if (get_queue_depth(&send_queue) == 0) { /* * Didn't get packets due to out of resources or TX * queue was drained. 
*/ break; } spin_unlock_bh(&target->tx_lock); /* send what we can */ htc_issue_packets(target, ep, &send_queue); if (!ep->pipe.tx_credit_flow_enabled) { pipeid = ep->pipe.pipeid_ul; tx_resources = ath6kl_hif_pipe_get_free_queue_number(ar, pipeid); } spin_lock_bh(&target->tx_lock); } /* done with this endpoint, we can clear the count */ ep->tx_proc_cnt = 0; spin_unlock_bh(&target->tx_lock); return HTC_SEND_QUEUE_OK; } /* htc control packet manipulation */ static void destroy_htc_txctrl_packet(struct htc_packet *packet) { struct sk_buff *skb; skb = packet->skb; dev_kfree_skb(skb); kfree(packet); } static struct htc_packet *build_htc_txctrl_packet(void) { struct htc_packet *packet = NULL; struct sk_buff *skb; packet = kzalloc(sizeof(struct htc_packet), GFP_KERNEL); if (packet == NULL) return NULL; skb = __dev_alloc_skb(HTC_CONTROL_BUFFER_SIZE, GFP_KERNEL); if (skb == NULL) { kfree(packet); return NULL; } packet->skb = skb; return packet; } static void htc_free_txctrl_packet(struct htc_target *target, struct htc_packet *packet) { destroy_htc_txctrl_packet(packet); } static struct htc_packet *htc_alloc_txctrl_packet(struct htc_target *target) { return build_htc_txctrl_packet(); } static void htc_txctrl_complete(struct htc_target *target, struct htc_packet *packet) { htc_free_txctrl_packet(target, packet); } #define MAX_MESSAGE_SIZE 1536 static int htc_setup_target_buffer_assignments(struct htc_target *target) { int status, credits, credit_per_maxmsg, i; struct htc_pipe_txcredit_alloc *entry; unsigned int hif_usbaudioclass = 0; credit_per_maxmsg = MAX_MESSAGE_SIZE / target->tgt_cred_sz; if (MAX_MESSAGE_SIZE % target->tgt_cred_sz) credit_per_maxmsg++; /* TODO, this should be configured by the caller! 
*/ credits = target->tgt_creds; entry = &target->pipe.txcredit_alloc[0]; status = -ENOMEM; /* FIXME: hif_usbaudioclass is always zero */ if (hif_usbaudioclass) { ath6kl_dbg(ATH6KL_DBG_HTC, "%s: For USB Audio Class- Total:%d\n", __func__, credits); entry++; entry++; /* Setup VO Service To have Max Credits */ entry->service_id = WMI_DATA_VO_SVC; entry->credit_alloc = (credits - 6); if (entry->credit_alloc == 0) entry->credit_alloc++; credits -= (int) entry->credit_alloc; if (credits <= 0) return status; entry++; entry->service_id = WMI_CONTROL_SVC; entry->credit_alloc = credit_per_maxmsg; credits -= (int) entry->credit_alloc; if (credits <= 0) return status; /* leftovers go to best effort */ entry++; entry++; entry->service_id = WMI_DATA_BE_SVC; entry->credit_alloc = (u8) credits; status = 0; } else { entry++; entry->service_id = WMI_DATA_VI_SVC; entry->credit_alloc = credits / 4; if (entry->credit_alloc == 0) entry->credit_alloc++; credits -= (int) entry->credit_alloc; if (credits <= 0) return status; entry++; entry->service_id = WMI_DATA_VO_SVC; entry->credit_alloc = credits / 4; if (entry->credit_alloc == 0) entry->credit_alloc++; credits -= (int) entry->credit_alloc; if (credits <= 0) return status; entry++; entry->service_id = WMI_CONTROL_SVC; entry->credit_alloc = credit_per_maxmsg; credits -= (int) entry->credit_alloc; if (credits <= 0) return status; entry++; entry->service_id = WMI_DATA_BK_SVC; entry->credit_alloc = credit_per_maxmsg; credits -= (int) entry->credit_alloc; if (credits <= 0) return status; /* leftovers go to best effort */ entry++; entry->service_id = WMI_DATA_BE_SVC; entry->credit_alloc = (u8) credits; status = 0; } if (status == 0) { for (i = 0; i < ENDPOINT_MAX; i++) { if (target->pipe.txcredit_alloc[i].service_id != 0) { ath6kl_dbg(ATH6KL_DBG_HTC, "HTC Service Index : %d TX : 0x%2.2X : alloc:%d\n", i, target->pipe.txcredit_alloc[i]. service_id, target->pipe.txcredit_alloc[i]. 
credit_alloc); } } } return status; } /* process credit reports and call distribution function */ static void htc_process_credit_report(struct htc_target *target, struct htc_credit_report *rpt, int num_entries, enum htc_endpoint_id from_ep) { int total_credits = 0, i; struct htc_endpoint *ep; /* lock out TX while we update credits */ spin_lock_bh(&target->tx_lock); for (i = 0; i < num_entries; i++, rpt++) { if (rpt->eid >= ENDPOINT_MAX) { WARN_ON_ONCE(1); spin_unlock_bh(&target->tx_lock); return; } ep = &target->endpoint[rpt->eid]; ep->cred_dist.credits += rpt->credits; if (ep->cred_dist.credits && get_queue_depth(&ep->txq)) { spin_unlock_bh(&target->tx_lock); htc_try_send(target, ep, NULL); spin_lock_bh(&target->tx_lock); } total_credits += rpt->credits; } ath6kl_dbg(ATH6KL_DBG_HTC, "Report indicated %d credits to distribute\n", total_credits); spin_unlock_bh(&target->tx_lock); } /* flush endpoint TX queue */ static void htc_flush_tx_endpoint(struct htc_target *target, struct htc_endpoint *ep, u16 tag) { struct htc_packet *packet; spin_lock_bh(&target->tx_lock); while (get_queue_depth(&ep->txq)) { packet = list_first_entry(&ep->txq, struct htc_packet, list); list_del(&packet->list); packet->status = 0; send_packet_completion(target, packet); } spin_unlock_bh(&target->tx_lock); } /* * In the adapted HIF layer, struct sk_buff * are passed between HIF and HTC, * since upper layers expects struct htc_packet containers we use the completed * skb and lookup it's corresponding HTC packet buffer from a lookup list. * This is extra overhead that can be fixed by re-aligning HIF interfaces with * HTC. 
*/ static struct htc_packet *htc_lookup_tx_packet(struct htc_target *target, struct htc_endpoint *ep, struct sk_buff *skb) { struct htc_packet *packet, *tmp_pkt, *found_packet = NULL; spin_lock_bh(&target->tx_lock); /* * interate from the front of tx lookup queue * this lookup should be fast since lower layers completes in-order and * so the completed packet should be at the head of the list generally */ list_for_each_entry_safe(packet, tmp_pkt, &ep->pipe.tx_lookup_queue, list) { /* check for removal */ if (skb == packet->skb) { /* found it */ list_del(&packet->list); found_packet = packet; break; } } spin_unlock_bh(&target->tx_lock); return found_packet; } static int ath6kl_htc_pipe_tx_complete(struct ath6kl *ar, struct sk_buff *skb) { struct htc_target *target = ar->htc_target; struct htc_frame_hdr *htc_hdr; struct htc_endpoint *ep; struct htc_packet *packet; u8 ep_id, *netdata; netdata = skb->data; htc_hdr = (struct htc_frame_hdr *) netdata; ep_id = htc_hdr->eid; ep = &target->endpoint[ep_id]; packet = htc_lookup_tx_packet(target, ep, skb); if (packet == NULL) { /* may have already been flushed and freed */ ath6kl_err("HTC TX lookup failed!\n"); } else { /* will be giving this buffer back to upper layers */ packet->status = 0; send_packet_completion(target, packet); } skb = NULL; if (!ep->pipe.tx_credit_flow_enabled) { /* * note: when using TX credit flow, the re-checking of queues * happens when credits flow back from the target. 
in the * non-TX credit case, we recheck after the packet completes */ htc_try_send(target, ep, NULL); } return 0; } static int htc_send_packets_multiple(struct htc_target *target, struct list_head *pkt_queue) { struct htc_endpoint *ep; struct htc_packet *packet, *tmp_pkt; if (list_empty(pkt_queue)) return -EINVAL; /* get first packet to find out which ep the packets will go into */ packet = list_first_entry(pkt_queue, struct htc_packet, list); if (packet->endpoint >= ENDPOINT_MAX) { WARN_ON_ONCE(1); return -EINVAL; } ep = &target->endpoint[packet->endpoint]; htc_try_send(target, ep, pkt_queue); /* do completion on any packets that couldn't get in */ if (!list_empty(pkt_queue)) { list_for_each_entry_safe(packet, tmp_pkt, pkt_queue, list) { packet->status = -ENOMEM; } do_send_completion(ep, pkt_queue); } return 0; } /* htc pipe rx path */ static struct htc_packet *alloc_htc_packet_container(struct htc_target *target) { struct htc_packet *packet; spin_lock_bh(&target->rx_lock); if (target->pipe.htc_packet_pool == NULL) { spin_unlock_bh(&target->rx_lock); return NULL; } packet = target->pipe.htc_packet_pool; target->pipe.htc_packet_pool = (struct htc_packet *) packet->list.next; spin_unlock_bh(&target->rx_lock); packet->list.next = NULL; return packet; } static void free_htc_packet_container(struct htc_target *target, struct htc_packet *packet) { struct list_head *lh; spin_lock_bh(&target->rx_lock); if (target->pipe.htc_packet_pool == NULL) { target->pipe.htc_packet_pool = packet; packet->list.next = NULL; } else { lh = (struct list_head *) target->pipe.htc_packet_pool; packet->list.next = lh; target->pipe.htc_packet_pool = packet; } spin_unlock_bh(&target->rx_lock); } static int htc_process_trailer(struct htc_target *target, u8 *buffer, int len, enum htc_endpoint_id from_ep) { struct htc_credit_report *report; struct htc_record_hdr *record; u8 *record_buf; int status = 0; while (len > 0) { if (len < sizeof(struct htc_record_hdr)) { status = -EINVAL; break; } /* these 
are byte aligned structs */ record = (struct htc_record_hdr *) buffer; len -= sizeof(struct htc_record_hdr); buffer += sizeof(struct htc_record_hdr); if (record->len > len) { /* no room left in buffer for record */ ath6kl_dbg(ATH6KL_DBG_HTC, "invalid length: %d (id:%d) buffer has: %d bytes left\n", record->len, record->rec_id, len); status = -EINVAL; break; } /* start of record follows the header */ record_buf = buffer; switch (record->rec_id) { case HTC_RECORD_CREDITS: if (record->len < sizeof(struct htc_credit_report)) { WARN_ON_ONCE(1); return -EINVAL; } report = (struct htc_credit_report *) record_buf; htc_process_credit_report(target, report, record->len / sizeof(*report), from_ep); break; default: ath6kl_dbg(ATH6KL_DBG_HTC, "unhandled record: id:%d length:%d\n", record->rec_id, record->len); break; } /* advance buffer past this record for next time around */ buffer += record->len; len -= record->len; } return status; } static void do_recv_completion(struct htc_endpoint *ep, struct list_head *queue_to_indicate) { struct htc_packet *packet; if (list_empty(queue_to_indicate)) { /* nothing to indicate */ return; } /* using legacy EpRecv */ while (!list_empty(queue_to_indicate)) { packet = list_first_entry(queue_to_indicate, struct htc_packet, list); list_del(&packet->list); ep->ep_cb.rx(ep->target, packet); } return; } static void recv_packet_completion(struct htc_target *target, struct htc_endpoint *ep, struct htc_packet *packet) { struct list_head container; INIT_LIST_HEAD(&container); list_add_tail(&packet->list, &container); /* do completion */ do_recv_completion(ep, &container); } static int ath6kl_htc_pipe_rx_complete(struct ath6kl *ar, struct sk_buff *skb, u8 pipeid) { struct htc_target *target = ar->htc_target; u8 *netdata, *trailer, hdr_info; struct htc_frame_hdr *htc_hdr; u32 netlen, trailerlen = 0; struct htc_packet *packet; struct htc_endpoint *ep; u16 payload_len; int status = 0; /* * ar->htc_target can be NULL due to a race condition that can occur 
* during driver initialization(we do 'ath6kl_hif_power_on' before * initializing 'ar->htc_target' via 'ath6kl_htc_create'). * 'ath6kl_hif_power_on' assigns 'ath6kl_recv_complete' as * usb_complete_t/callback function for 'usb_fill_bulk_urb'. * Thus the possibility of ar->htc_target being NULL * via ath6kl_recv_complete -> ath6kl_usb_io_comp_work. */ if (!target) { ath6kl_dbg(ATH6KL_DBG_HTC, "Target not yet initialized\n"); status = -EINVAL; goto free_skb; } netdata = skb->data; netlen = skb->len; htc_hdr = (struct htc_frame_hdr *) netdata; if (htc_hdr->eid >= ENDPOINT_MAX) { ath6kl_dbg(ATH6KL_DBG_HTC, "HTC Rx: invalid EndpointID=%d\n", htc_hdr->eid); status = -EINVAL; goto free_skb; } ep = &target->endpoint[htc_hdr->eid]; payload_len = le16_to_cpu(get_unaligned(&htc_hdr->payld_len)); if (netlen < (payload_len + HTC_HDR_LENGTH)) { ath6kl_dbg(ATH6KL_DBG_HTC, "HTC Rx: insufficient length, got:%d expected =%zu\n", netlen, payload_len + HTC_HDR_LENGTH); status = -EINVAL; goto free_skb; } /* get flags to check for trailer */ hdr_info = htc_hdr->flags; if (hdr_info & HTC_FLG_RX_TRAILER) { /* extract the trailer length */ hdr_info = htc_hdr->ctrl[0]; if ((hdr_info < sizeof(struct htc_record_hdr)) || (hdr_info > payload_len)) { ath6kl_dbg(ATH6KL_DBG_HTC, "invalid header: payloadlen should be %d, CB[0]: %d\n", payload_len, hdr_info); status = -EINVAL; goto free_skb; } trailerlen = hdr_info; /* process trailer after hdr/apps payload */ trailer = (u8 *) htc_hdr + HTC_HDR_LENGTH + payload_len - hdr_info; status = htc_process_trailer(target, trailer, hdr_info, htc_hdr->eid); if (status != 0) goto free_skb; } if (((int) payload_len - (int) trailerlen) <= 0) { /* zero length packet with trailer, just drop these */ goto free_skb; } if (htc_hdr->eid == ENDPOINT_0) { /* handle HTC control message */ if (target->htc_flags & HTC_OP_STATE_SETUP_COMPLETE) { /* * fatal: target should not send unsolicited * messageson the endpoint 0 */ ath6kl_dbg(ATH6KL_DBG_HTC, "HTC ignores Rx Ctrl after 
setup complete\n"); status = -EINVAL; goto free_skb; } /* remove HTC header */ skb_pull(skb, HTC_HDR_LENGTH); netdata = skb->data; netlen = skb->len; spin_lock_bh(&target->rx_lock); target->pipe.ctrl_response_valid = true; target->pipe.ctrl_response_len = min_t(int, netlen, HTC_MAX_CTRL_MSG_LEN); memcpy(target->pipe.ctrl_response_buf, netdata, target->pipe.ctrl_response_len); spin_unlock_bh(&target->rx_lock); dev_kfree_skb(skb); skb = NULL; goto free_skb; } /* * TODO: the message based HIF architecture allocates net bufs * for recv packets since it bridges that HIF to upper layers, * which expects HTC packets, we form the packets here */ packet = alloc_htc_packet_container(target); if (packet == NULL) { status = -ENOMEM; goto free_skb; } packet->status = 0; packet->endpoint = htc_hdr->eid; packet->pkt_cntxt = skb; /* TODO: for backwards compatibility */ packet->buf = skb_push(skb, 0) + HTC_HDR_LENGTH; packet->act_len = netlen - HTC_HDR_LENGTH - trailerlen; /* * TODO: this is a hack because the driver layer will set the * actual len of the skb again which will just double the len */ skb_trim(skb, 0); recv_packet_completion(target, ep, packet); /* recover the packet container */ free_htc_packet_container(target, packet); skb = NULL; free_skb: dev_kfree_skb(skb); return status; } static void htc_flush_rx_queue(struct htc_target *target, struct htc_endpoint *ep) { struct list_head container; struct htc_packet *packet; spin_lock_bh(&target->rx_lock); while (1) { if (list_empty(&ep->rx_bufq)) break; packet = list_first_entry(&ep->rx_bufq, struct htc_packet, list); list_del(&packet->list); spin_unlock_bh(&target->rx_lock); packet->status = -ECANCELED; packet->act_len = 0; ath6kl_dbg(ATH6KL_DBG_HTC, "Flushing RX packet:0x%p, length:%d, ep:%d\n", packet, packet->buf_len, packet->endpoint); INIT_LIST_HEAD(&container); list_add_tail(&packet->list, &container); /* give the packet back */ do_recv_completion(ep, &container); spin_lock_bh(&target->rx_lock); } 
spin_unlock_bh(&target->rx_lock); } /* polling routine to wait for a control packet to be received */ static int htc_wait_recv_ctrl_message(struct htc_target *target) { int count = HTC_TARGET_RESPONSE_POLL_COUNT; while (count > 0) { spin_lock_bh(&target->rx_lock); if (target->pipe.ctrl_response_valid) { target->pipe.ctrl_response_valid = false; spin_unlock_bh(&target->rx_lock); break; } spin_unlock_bh(&target->rx_lock); count--; msleep_interruptible(HTC_TARGET_RESPONSE_POLL_WAIT); } if (count <= 0) { ath6kl_warn("htc pipe control receive timeout!\n"); return -ETIMEDOUT; } return 0; } static void htc_rxctrl_complete(struct htc_target *context, struct htc_packet *packet) { struct sk_buff *skb = packet->skb; if (packet->endpoint == ENDPOINT_0 && packet->status == -ECANCELED && skb != NULL) dev_kfree_skb(skb); } /* htc pipe initialization */ static void reset_endpoint_states(struct htc_target *target) { struct htc_endpoint *ep; int i; for (i = ENDPOINT_0; i < ENDPOINT_MAX; i++) { ep = &target->endpoint[i]; ep->svc_id = 0; ep->len_max = 0; ep->max_txq_depth = 0; ep->eid = i; INIT_LIST_HEAD(&ep->txq); INIT_LIST_HEAD(&ep->pipe.tx_lookup_queue); INIT_LIST_HEAD(&ep->rx_bufq); ep->target = target; ep->pipe.tx_credit_flow_enabled = true; } } /* start HTC, this is called after all services are connected */ static int htc_config_target_hif_pipe(struct htc_target *target) { return 0; } /* htc service functions */ static u8 htc_get_credit_alloc(struct htc_target *target, u16 service_id) { u8 allocation = 0; int i; for (i = 0; i < ENDPOINT_MAX; i++) { if (target->pipe.txcredit_alloc[i].service_id == service_id) allocation = target->pipe.txcredit_alloc[i].credit_alloc; } if (allocation == 0) { ath6kl_dbg(ATH6KL_DBG_HTC, "HTC Service TX : 0x%2.2X : allocation is zero!\n", service_id); } return allocation; } static int ath6kl_htc_pipe_conn_service(struct htc_target *target, struct htc_service_connect_req *conn_req, struct htc_service_connect_resp *conn_resp) { struct ath6kl *ar = 
target->dev->ar; struct htc_packet *packet = NULL; struct htc_conn_service_resp *resp_msg; struct htc_conn_service_msg *conn_msg; enum htc_endpoint_id assigned_epid = ENDPOINT_MAX; bool disable_credit_flowctrl = false; unsigned int max_msg_size = 0; struct htc_endpoint *ep; int length, status = 0; struct sk_buff *skb; u8 tx_alloc; u16 flags; if (conn_req->svc_id == 0) { WARN_ON_ONCE(1); status = -EINVAL; goto free_packet; } if (conn_req->svc_id == HTC_CTRL_RSVD_SVC) { /* special case for pseudo control service */ assigned_epid = ENDPOINT_0; max_msg_size = HTC_MAX_CTRL_MSG_LEN; tx_alloc = 0; } else { tx_alloc = htc_get_credit_alloc(target, conn_req->svc_id); if (tx_alloc == 0) { status = -ENOMEM; goto free_packet; } /* allocate a packet to send to the target */ packet = htc_alloc_txctrl_packet(target); if (packet == NULL) { WARN_ON_ONCE(1); status = -ENOMEM; goto free_packet; } skb = packet->skb; length = sizeof(struct htc_conn_service_msg); /* assemble connect service message */ conn_msg = skb_put(skb, length); if (conn_msg == NULL) { WARN_ON_ONCE(1); status = -EINVAL; goto free_packet; } memset(conn_msg, 0, sizeof(struct htc_conn_service_msg)); conn_msg->msg_id = cpu_to_le16(HTC_MSG_CONN_SVC_ID); conn_msg->svc_id = cpu_to_le16(conn_req->svc_id); conn_msg->conn_flags = cpu_to_le16(conn_req->conn_flags & ~HTC_CONN_FLGS_SET_RECV_ALLOC_MASK); /* tell target desired recv alloc for this ep */ flags = tx_alloc << HTC_CONN_FLGS_SET_RECV_ALLOC_SHIFT; conn_msg->conn_flags |= cpu_to_le16(flags); if (conn_req->conn_flags & HTC_CONN_FLGS_DISABLE_CRED_FLOW_CTRL) { disable_credit_flowctrl = true; } set_htc_pkt_info(packet, NULL, (u8 *) conn_msg, length, ENDPOINT_0, HTC_SERVICE_TX_PACKET_TAG); status = ath6kl_htc_pipe_tx(target, packet); /* we don't own it anymore */ packet = NULL; if (status != 0) goto free_packet; /* wait for response */ status = htc_wait_recv_ctrl_message(target); if (status != 0) goto free_packet; /* we controlled the buffer creation so it has to be * 
properly aligned */ resp_msg = (struct htc_conn_service_resp *) target->pipe.ctrl_response_buf; if (resp_msg->msg_id != cpu_to_le16(HTC_MSG_CONN_SVC_RESP_ID) || (target->pipe.ctrl_response_len < sizeof(*resp_msg))) { /* this message is not valid */ WARN_ON_ONCE(1); status = -EINVAL; goto free_packet; } ath6kl_dbg(ATH6KL_DBG_TRC, "%s: service 0x%X conn resp: status: %d ep: %d\n", __func__, resp_msg->svc_id, resp_msg->status, resp_msg->eid); conn_resp->resp_code = resp_msg->status; /* check response status */ if (resp_msg->status != HTC_SERVICE_SUCCESS) { ath6kl_dbg(ATH6KL_DBG_HTC, "Target failed service 0x%X connect request (status:%d)\n", resp_msg->svc_id, resp_msg->status); status = -EINVAL; goto free_packet; } assigned_epid = (enum htc_endpoint_id) resp_msg->eid; max_msg_size = le16_to_cpu(resp_msg->max_msg_sz); } /* the rest are parameter checks so set the error status */ status = -EINVAL; if (assigned_epid >= ENDPOINT_MAX) { WARN_ON_ONCE(1); goto free_packet; } if (max_msg_size == 0) { WARN_ON_ONCE(1); goto free_packet; } ep = &target->endpoint[assigned_epid]; ep->eid = assigned_epid; if (ep->svc_id != 0) { /* endpoint already in use! 
*/ WARN_ON_ONCE(1); goto free_packet; } /* return assigned endpoint to caller */ conn_resp->endpoint = assigned_epid; conn_resp->len_max = max_msg_size; /* setup the endpoint */ ep->svc_id = conn_req->svc_id; /* this marks ep in use */ ep->max_txq_depth = conn_req->max_txq_depth; ep->len_max = max_msg_size; ep->cred_dist.credits = tx_alloc; ep->cred_dist.cred_sz = target->tgt_cred_sz; ep->cred_dist.cred_per_msg = max_msg_size / target->tgt_cred_sz; if (max_msg_size % target->tgt_cred_sz) ep->cred_dist.cred_per_msg++; /* copy all the callbacks */ ep->ep_cb = conn_req->ep_cb; /* initialize tx_drop_packet_threshold */ ep->tx_drop_packet_threshold = MAX_HI_COOKIE_NUM; status = ath6kl_hif_pipe_map_service(ar, ep->svc_id, &ep->pipe.pipeid_ul, &ep->pipe.pipeid_dl); if (status != 0) goto free_packet; ath6kl_dbg(ATH6KL_DBG_HTC, "SVC Ready: 0x%4.4X: ULpipe:%d DLpipe:%d id:%d\n", ep->svc_id, ep->pipe.pipeid_ul, ep->pipe.pipeid_dl, ep->eid); if (disable_credit_flowctrl && ep->pipe.tx_credit_flow_enabled) { ep->pipe.tx_credit_flow_enabled = false; ath6kl_dbg(ATH6KL_DBG_HTC, "SVC: 0x%4.4X ep:%d TX flow control off\n", ep->svc_id, assigned_epid); } free_packet: if (packet != NULL) htc_free_txctrl_packet(target, packet); return status; } /* htc export functions */ static void *ath6kl_htc_pipe_create(struct ath6kl *ar) { int status = 0; struct htc_endpoint *ep = NULL; struct htc_target *target = NULL; struct htc_packet *packet; int i; target = kzalloc(sizeof(struct htc_target), GFP_KERNEL); if (target == NULL) { ath6kl_err("htc create unable to allocate memory\n"); status = -ENOMEM; goto fail_htc_create; } spin_lock_init(&target->htc_lock); spin_lock_init(&target->rx_lock); spin_lock_init(&target->tx_lock); reset_endpoint_states(target); for (i = 0; i < HTC_PACKET_CONTAINER_ALLOCATION; i++) { packet = kzalloc(sizeof(struct htc_packet), GFP_KERNEL); if (packet != NULL) free_htc_packet_container(target, packet); } target->dev = kzalloc(sizeof(*target->dev), GFP_KERNEL); if 
(!target->dev) { ath6kl_err("unable to allocate memory\n"); status = -ENOMEM; goto fail_htc_create; } target->dev->ar = ar; target->dev->htc_cnxt = target; /* Get HIF default pipe for HTC message exchange */ ep = &target->endpoint[ENDPOINT_0]; ath6kl_hif_pipe_get_default(ar, &ep->pipe.pipeid_ul, &ep->pipe.pipeid_dl); return target; fail_htc_create: if (status != 0) { if (target != NULL) ath6kl_htc_pipe_cleanup(target); target = NULL; } return target; } /* cleanup the HTC instance */ static void ath6kl_htc_pipe_cleanup(struct htc_target *target) { struct htc_packet *packet; while (true) { packet = alloc_htc_packet_container(target); if (packet == NULL) break; kfree(packet); } kfree(target->dev); /* kfree our instance */ kfree(target); } static int ath6kl_htc_pipe_start(struct htc_target *target) { struct sk_buff *skb; struct htc_setup_comp_ext_msg *setup; struct htc_packet *packet; htc_config_target_hif_pipe(target); /* allocate a buffer to send */ packet = htc_alloc_txctrl_packet(target); if (packet == NULL) { WARN_ON_ONCE(1); return -ENOMEM; } skb = packet->skb; /* assemble setup complete message */ setup = skb_put(skb, sizeof(*setup)); memset(setup, 0, sizeof(struct htc_setup_comp_ext_msg)); setup->msg_id = cpu_to_le16(HTC_MSG_SETUP_COMPLETE_EX_ID); ath6kl_dbg(ATH6KL_DBG_HTC, "HTC using TX credit flow control\n"); set_htc_pkt_info(packet, NULL, (u8 *) setup, sizeof(struct htc_setup_comp_ext_msg), ENDPOINT_0, HTC_SERVICE_TX_PACKET_TAG); target->htc_flags |= HTC_OP_STATE_SETUP_COMPLETE; return ath6kl_htc_pipe_tx(target, packet); } static void ath6kl_htc_pipe_stop(struct htc_target *target) { int i; struct htc_endpoint *ep; /* cleanup endpoints */ for (i = 0; i < ENDPOINT_MAX; i++) { ep = &target->endpoint[i]; htc_flush_rx_queue(target, ep); htc_flush_tx_endpoint(target, ep, HTC_TX_PACKET_TAG_ALL); } reset_endpoint_states(target); target->htc_flags &= ~HTC_OP_STATE_SETUP_COMPLETE; } static int ath6kl_htc_pipe_get_rxbuf_num(struct htc_target *target, enum 
htc_endpoint_id endpoint) { int num; spin_lock_bh(&target->rx_lock); num = get_queue_depth(&(target->endpoint[endpoint].rx_bufq)); spin_unlock_bh(&target->rx_lock); return num; } static int ath6kl_htc_pipe_tx(struct htc_target *target, struct htc_packet *packet) { struct list_head queue; ath6kl_dbg(ATH6KL_DBG_HTC, "%s: endPointId: %d, buffer: 0x%p, length: %d\n", __func__, packet->endpoint, packet->buf, packet->act_len); INIT_LIST_HEAD(&queue); list_add_tail(&packet->list, &queue); return htc_send_packets_multiple(target, &queue); } static int ath6kl_htc_pipe_wait_target(struct htc_target *target) { struct htc_ready_ext_msg *ready_msg; struct htc_service_connect_req connect; struct htc_service_connect_resp resp; int status = 0; status = htc_wait_recv_ctrl_message(target); if (status != 0) return status; if (target->pipe.ctrl_response_len < sizeof(*ready_msg)) { ath6kl_warn("invalid htc pipe ready msg len: %d\n", target->pipe.ctrl_response_len); return -ECOMM; } ready_msg = (struct htc_ready_ext_msg *) target->pipe.ctrl_response_buf; if (ready_msg->ver2_0_info.msg_id != cpu_to_le16(HTC_MSG_READY_ID)) { ath6kl_warn("invalid htc pipe ready msg: 0x%x\n", ready_msg->ver2_0_info.msg_id); return -ECOMM; } ath6kl_dbg(ATH6KL_DBG_HTC, "Target Ready! 
: transmit resources : %d size:%d\n", ready_msg->ver2_0_info.cred_cnt, ready_msg->ver2_0_info.cred_sz); target->tgt_creds = le16_to_cpu(ready_msg->ver2_0_info.cred_cnt); target->tgt_cred_sz = le16_to_cpu(ready_msg->ver2_0_info.cred_sz); if ((target->tgt_creds == 0) || (target->tgt_cred_sz == 0)) return -ECOMM; htc_setup_target_buffer_assignments(target); /* setup our pseudo HTC control endpoint connection */ memset(&connect, 0, sizeof(connect)); memset(&resp, 0, sizeof(resp)); connect.ep_cb.tx_complete = htc_txctrl_complete; connect.ep_cb.rx = htc_rxctrl_complete; connect.max_txq_depth = NUM_CONTROL_TX_BUFFERS; connect.svc_id = HTC_CTRL_RSVD_SVC; /* connect fake service */ status = ath6kl_htc_pipe_conn_service(target, &connect, &resp); return status; } static void ath6kl_htc_pipe_flush_txep(struct htc_target *target, enum htc_endpoint_id endpoint, u16 tag) { struct htc_endpoint *ep = &target->endpoint[endpoint]; if (ep->svc_id == 0) { WARN_ON_ONCE(1); /* not in use.. */ return; } htc_flush_tx_endpoint(target, ep, tag); } static int ath6kl_htc_pipe_add_rxbuf_multiple(struct htc_target *target, struct list_head *pkt_queue) { struct htc_packet *packet, *tmp_pkt, *first; struct htc_endpoint *ep; int status = 0; if (list_empty(pkt_queue)) return -EINVAL; first = list_first_entry(pkt_queue, struct htc_packet, list); if (first->endpoint >= ENDPOINT_MAX) { WARN_ON_ONCE(1); return -EINVAL; } ath6kl_dbg(ATH6KL_DBG_HTC, "%s: epid: %d, cnt:%d, len: %d\n", __func__, first->endpoint, get_queue_depth(pkt_queue), first->buf_len); ep = &target->endpoint[first->endpoint]; spin_lock_bh(&target->rx_lock); /* store receive packets */ list_splice_tail_init(pkt_queue, &ep->rx_bufq); spin_unlock_bh(&target->rx_lock); if (status != 0) { /* walk through queue and mark each one canceled */ list_for_each_entry_safe(packet, tmp_pkt, pkt_queue, list) { packet->status = -ECANCELED; } do_recv_completion(ep, pkt_queue); } return status; } static void ath6kl_htc_pipe_activity_changed(struct 
htc_target *target, enum htc_endpoint_id ep, bool active) { /* TODO */ } static void ath6kl_htc_pipe_flush_rx_buf(struct htc_target *target) { struct htc_endpoint *endpoint; struct htc_packet *packet, *tmp_pkt; int i; for (i = ENDPOINT_0; i < ENDPOINT_MAX; i++) { endpoint = &target->endpoint[i]; spin_lock_bh(&target->rx_lock); list_for_each_entry_safe(packet, tmp_pkt, &endpoint->rx_bufq, list) { list_del(&packet->list); spin_unlock_bh(&target->rx_lock); ath6kl_dbg(ATH6KL_DBG_HTC, "htc rx flush pkt 0x%p len %d ep %d\n", packet, packet->buf_len, packet->endpoint); dev_kfree_skb(packet->pkt_cntxt); spin_lock_bh(&target->rx_lock); } spin_unlock_bh(&target->rx_lock); } } static int ath6kl_htc_pipe_credit_setup(struct htc_target *target, struct ath6kl_htc_credit_info *info) { return 0; } static const struct ath6kl_htc_ops ath6kl_htc_pipe_ops = { .create = ath6kl_htc_pipe_create, .wait_target = ath6kl_htc_pipe_wait_target, .start = ath6kl_htc_pipe_start, .conn_service = ath6kl_htc_pipe_conn_service, .tx = ath6kl_htc_pipe_tx, .stop = ath6kl_htc_pipe_stop, .cleanup = ath6kl_htc_pipe_cleanup, .flush_txep = ath6kl_htc_pipe_flush_txep, .flush_rx_buf = ath6kl_htc_pipe_flush_rx_buf, .activity_changed = ath6kl_htc_pipe_activity_changed, .get_rxbuf_num = ath6kl_htc_pipe_get_rxbuf_num, .add_rxbuf_multiple = ath6kl_htc_pipe_add_rxbuf_multiple, .credit_setup = ath6kl_htc_pipe_credit_setup, .tx_complete = ath6kl_htc_pipe_tx_complete, .rx_complete = ath6kl_htc_pipe_rx_complete, }; void ath6kl_htc_pipe_attach(struct ath6kl *ar) { ar->htc_ops = &ath6kl_htc_pipe_ops; }
12 12 4382 4386 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of device match structures to search in * @dev: the of device structure to match against * * Used by a driver to check whether an platform_device present in the * system is in its list of supported devices. 
*/ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *node, *of_node = dev->of_node; int count, i; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; count = of_property_count_elems_of_size(of_node, "memory-region", sizeof(u32)); /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (count <= 0) { of_node = np; count = of_property_count_elems_of_size( of_node, "memory-region", sizeof(u32)); } for (i = 0; i < count; i++) { node = of_parse_phandle(of_node, "memory-region", i); /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(node, "restricted-dma-pool") && of_device_is_available(node)) { of_node_put(node); break; } of_node_put(node); } /* * Attempt to initialize a restricted-dma-pool region if one was found. * Note that count can hold a negative error code. */ if (i < count && of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. * * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. 
*/
int of_dma_configure_id(struct device *dev, struct device_node *np,
			bool force_dma, const u32 *id)
{
	const struct bus_dma_region *map = NULL;
	struct device_node *bus_np;
	u64 mask, end = 0;
	bool coherent, set_map = false;
	int ret;

	/*
	 * When @np is the device's own node the ranges are looked up on its
	 * DMA parent; otherwise @np itself is used (with a reference taken
	 * so the of_node_put() below is balanced).
	 */
	if (np == dev->of_node)
		bus_np = __of_get_dma_parent(np);
	else
		bus_np = of_node_get(np);

	ret = of_dma_get_range(bus_np, &map);
	of_node_put(bus_np);
	if (ret < 0) {
		/*
		 * For legacy reasons, we have to assume some devices need
		 * DMA configuration regardless of whether "dma-ranges" is
		 * correctly specified or not.
		 */
		if (!force_dma)
			/* -ENODEV is reported to the caller as success (0) */
			return ret == -ENODEV ? 0 : ret;
	} else {
		/* Determine the overall bounds of all DMA regions */
		end = dma_range_map_max(map);
		set_map = true;
	}

	/*
	 * If @dev is expected to be DMA-capable then the bus code that created
	 * it should have initialised its dma_mask pointer by this point. For
	 * now, we'll continue the legacy behaviour of coercing it to the
	 * coherent mask if not, but we'll no longer do so quietly.
	 */
	if (!dev->dma_mask) {
		dev_warn(dev, "DMA mask not set\n");
		dev->dma_mask = &dev->coherent_dma_mask;
	}

	/*
	 * Without a range-derived bound, fall back to the coherent mask, or
	 * to a 32-bit limit if that is unset as well.
	 */
	if (!end && dev->coherent_dma_mask)
		end = dev->coherent_dma_mask;
	else if (!end)
		end = (1ULL << 32) - 1;

	/*
	 * Limit coherent and dma mask based on size and default mask
	 * set by the driver.
	 */
	mask = DMA_BIT_MASK(ilog2(end) + 1);
	dev->coherent_dma_mask &= mask;
	*dev->dma_mask &= mask;
	/* ...but only set bus limit and range map if we found valid dma-ranges earlier */
	if (set_map) {
		dev->bus_dma_limit = end;
		dev->dma_range_map = map;
	}

	coherent = of_dma_is_coherent(np);
	dev_dbg(dev, "device is%sdma coherent\n",
		coherent ? " " : " not ");

	ret = of_iommu_configure(dev, np, id);
	if (ret == -EPROBE_DEFER) {
		/* Don't touch range map if it wasn't set from a valid dma-ranges */
		if (set_map)
			dev->dma_range_map = NULL;
		kfree(map);
		return -EPROBE_DEFER;
	}
	/* Take all other IOMMU errors to mean we'll just carry on without it */
	dev_dbg(dev, "device is%sbehind an iommu\n",
		!ret ? " " : " not ");

	arch_setup_dma_ops(dev, coherent);

	/* the restricted pool is only set up when IOMMU configuration returned non-zero */
	if (ret)
		of_dma_set_restricted_buffer(dev, np);

	return 0;
}
EXPORT_SYMBOL_GPL(of_dma_configure_id);

/*
 * Return the ->data pointer of the of_device_id entry in the bound driver's
 * of_match_table that matches @dev, or NULL when nothing matches.
 * dev->driver is dereferenced without a NULL check.
 */
const void *of_device_get_match_data(const struct device *dev)
{
	const struct of_device_id *match;

	match = of_match_device(dev->driver->of_match_table, dev);
	if (!match)
		return NULL;

	return match->data;
}
EXPORT_SYMBOL(of_device_get_match_data);

/**
 * of_device_modalias - Fill buffer with newline terminated modalias string
 * @dev:	Calling device
 * @str:	Modalias string
 * @len:	Size of @str
 *
 * Return: the string length including the trailing newline, -ENODEV if @dev
 * has no (non-reused) OF node, -ENOMEM if the modalias does not fit, or the
 * negative value returned by of_modalias().
 */
ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len)
{
	ssize_t sl;

	if (!dev || !dev->of_node || dev->of_node_reused)
		return -ENODEV;

	/* two bytes are held back for the '\n' and the terminating NUL */
	sl = of_modalias(dev->of_node, str, len - 2);
	if (sl < 0)
		return sl;
	if (sl > len - 2)
		return -ENOMEM;

	str[sl++] = '\n';
	str[sl] = 0;
	return sl;
}
EXPORT_SYMBOL_GPL(of_device_modalias);

/**
 * of_device_uevent - Display OF related uevent information
 * @dev:	Device to display the uevent information for
 * @env:	Kernel object's userspace event reference to fill up
 *
 * Emits OF_NAME, OF_FULLNAME, optionally OF_TYPE, one OF_COMPATIBLE_<n> per
 * "compatible" string plus OF_COMPATIBLE_N, and one OF_ALIAS_<n> per alias
 * that refers to the device's node. Does nothing if @dev or its OF node is
 * NULL.
 */
void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
	const char *compat, *type;
	struct alias_prop *app;
	struct property *p;
	int seen = 0;

	if ((!dev) || (!dev->of_node))
		return;

	add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node);
	add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node);
	type = of_node_get_device_type(dev->of_node);
	if (type)
		add_uevent_var(env, "OF_TYPE=%s", type);

	/* Since the compatible field can contain pretty much anything
	 * it's not really legal to split it out with commas. We split it
	 * up using a number of environment variables instead. */
	of_property_for_each_string(dev->of_node, "compatible", p, compat) {
		add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat);
		seen++;
	}
	add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen);

	/* the alias list is walked with of_mutex held */
	seen = 0;
	mutex_lock(&of_mutex);
	list_for_each_entry(app, &aliases_lookup, link) {
		if (dev->of_node == app->np) {
			add_uevent_var(env, "OF_ALIAS_%d=%s", seen,
				       app->alias);
			seen++;
		}
	}
	mutex_unlock(&of_mutex);
}
EXPORT_SYMBOL_GPL(of_device_uevent);

/*
 * Append a "MODALIAS=<of modalias>" variable to @env.
 *
 * Returns 0 on success, -ENODEV if @dev has no (non-reused) OF node,
 * -ENOMEM if the variable does not fit, or the negative value returned by
 * of_modalias().
 */
int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env)
{
	int sl;

	if ((!dev) || (!dev->of_node) || dev->of_node_reused)
		return -ENODEV;

	/* Devicetree modalias is tricky, we add it in 2 steps */
	if (add_uevent_var(env, "MODALIAS="))
		return -ENOMEM;

	/*
	 * The modalias is written starting at the last byte counted in
	 * buflen — presumably the NUL terminator left by add_uevent_var();
	 * confirm against its implementation.
	 */
	sl = of_modalias(dev->of_node, &env->buf[env->buflen-1],
			 sizeof(env->buf) - env->buflen);
	if (sl < 0)
		return sl;
	if (sl >= (sizeof(env->buf) - env->buflen))
		return -ENOMEM;
	env->buflen += sl;

	return 0;
}
EXPORT_SYMBOL_GPL(of_device_uevent_modalias);

/**
 * of_device_make_bus_id - Use the device node data to assign a unique name
 * @dev: pointer to device structure that is linked to a device tree node
 *
 * This routine will first try using the translated bus address to
 * derive a unique name. If it cannot, then it will prepend names from
 * parent nodes until a unique name can be derived.
 */
void of_device_make_bus_id(struct device *dev)
{
	struct device_node *node = dev->of_node;
	const __be32 *reg;
	u64 addr;
	u32 mask;

	/* Construct the name, using parent nodes if necessary to ensure uniqueness */
	while (node->parent) {
		/*
		 * If the address can be translated, then that is as much
		 * uniqueness as we need. Make it the first component and return
		 */
		reg = of_get_property(node, "reg", NULL);
		if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) {
			/* a "mask" property adds the index of its lowest set bit to the name */
			if (!of_property_read_u32(node, "mask", &mask))
				dev_set_name(dev, dev_name(dev) ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn",
					     addr, ffs(mask) - 1, node, dev_name(dev));

			else
				dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn",
					     addr, node, dev_name(dev));
			return;
		}

		/* format arguments only used if dev_name() resolves to NULL */
		dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s",
			     kbasename(node->full_name), dev_name(dev));
		node = node->parent;
	}
}
EXPORT_SYMBOL_GPL(of_device_make_bus_id);
5 5 5 2 5 5 5 2 3 5 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013
 * Phillip Lougher <phillip@squashfs.org.uk>
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/mutex.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "page_actor.h"

/*
 * Read separately compressed datablock directly into page cache
 *
 * @folio:    folio whose read triggered this call; its page is always part
 *            of the set handed to the decompressor
 * @block:    passed through to squashfs_read_data()
 * @bsize:    passed through to squashfs_read_data()
 * @expected: number of bytes the decompressed block must contain
 *
 * Returns 0 on success, -ENOMEM if the page array or the page actor cannot
 * be allocated, -EIO if the decompressed size differs from @expected or the
 * actor reports an error page, or the negative value returned by
 * squashfs_read_data(). On failure the target page is left for the caller
 * to deal with.
 */
int squashfs_readpage_block(struct folio *folio, u64 block, int bsize,
	int expected)
{
	struct page *target_page = &folio->page;
	struct inode *inode = folio->mapping->host;
	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
	/* index of the last page of the file */
	loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
	/* (pages per Squashfs block) - 1, used to align indices to a block */
	int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
	loff_t start_index = folio->index & ~mask;
	loff_t end_index = start_index | mask;
	loff_t index;
	int i, pages, bytes, res = -ENOMEM;
	struct page **page, *last_page;
	struct squashfs_page_actor *actor;
	void *pageaddr;

	/* do not reach past the end of the file */
	if (end_index > file_end)
		end_index = file_end;

	pages = end_index - start_index + 1;

	page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
	if (page == NULL)
		return res;

	/* Try to grab all the pages covered by the Squashfs block */
	for (i = 0, index = start_index; index <= end_index; index++) {
		page[i] = (index == folio->index) ? target_page :
			grab_cache_page_nowait(folio->mapping, index);

		/* page could not be grabbed: leave it out of the set */
		if (page[i] == NULL)
			continue;

		/* already up to date: release it and leave it out of the set */
		if (PageUptodate(page[i])) {
			unlock_page(page[i]);
			put_page(page[i]);
			continue;
		}

		i++;
	}

	/* from here on, pages counts only the pages actually collected */
	pages = i;

	/*
	 * Create a "page actor" which will kmap and kunmap the
	 * page cache pages appropriately within the decompressor
	 */
	actor = squashfs_page_actor_init_special(msblk, page, pages, expected,
						start_index << PAGE_SHIFT);
	if (actor == NULL)
		goto out;

	/* Decompress directly into the page cache buffers */
	res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);

	last_page = squashfs_page_actor_free(actor);

	if (res < 0)
		goto mark_errored;

	if (res != expected || IS_ERR(last_page)) {
		res = -EIO;
		goto mark_errored;
	}

	/* Last page (if present) may have trailing bytes not filled */
	bytes = res % PAGE_SIZE;
	if (end_index == file_end && last_page && bytes) {
		pageaddr = kmap_local_page(last_page);
		memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
		kunmap_local(pageaddr);
	}

	/* Mark pages as uptodate, unlock and release */
	for (i = 0; i < pages; i++) {
		flush_dcache_page(page[i]);
		SetPageUptodate(page[i]);
		unlock_page(page[i]);
		/* the target page is unlocked here but its reference is kept */
		if (page[i] != target_page)
			put_page(page[i]);
	}

	kfree(page);

	return 0;

mark_errored:
	/* Decompression failed.  Target_page is
	 * dealt with by the caller
	 */
	for (i = 0; i < pages; i++) {
		if (page[i] == NULL || page[i] == target_page)
			continue;
		flush_dcache_page(page[i]);
		unlock_page(page[i]);
		put_page(page[i]);
	}

out:
	kfree(page);
	return res;
}
932 930 934 935 933 934 119 238 162 163 2 40 40 40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Pseudo-driver for the loopback interface. * * Version: @(#)loopback.c 1.0.4b 08/16/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@scyld.com> * * Alan Cox : Fixed oddments for NET3.014 * Alan Cox : Rejig for NET3.029 snap #3 * Alan Cox : Fixed NET3.029 bugs and sped up * Larry McVoy : Tiny tweak to double performance * Alan Cox : Backed out LMV's tweak - the linux mm * can't take it... * Michael Griffith: Don't bother computing the checksums * on packets received on the loopback * interface. * Alexey Kuznetsov: Potential hang under some extreme * cases removed. 
*/

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/in.h>

#include <linux/uaccess.h>
#include <linux/io.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/ethtool.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <linux/if_ether.h>	/* For the statistics structure. */
#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/percpu.h>
#include <linux/net_tstamp.h>
#include <net/net_namespace.h>
#include <linux/u64_stats_sync.h>

/* blackhole_netdev - a device used for dsts that are marked expired!
 * This is global device (instead of per-net-ns) since it's not needed
 * to be per-ns and gets initialized at boot time.
 */
struct net_device *blackhole_netdev;
EXPORT_SYMBOL(blackhole_netdev);

/* The higher levels take care of making this non-reentrant (it's
 * called with bh's disabled).
 *
 * Feeds the transmitted skb straight back into the receive path via
 * __netif_rx() and always reports NETDEV_TX_OK; the device statistics are
 * only updated when __netif_rx() returns NET_RX_SUCCESS.
 */
static netdev_tx_t loopback_xmit(struct sk_buff *skb,
				 struct net_device *dev)
{
	int len;

	skb_tx_timestamp(skb);

	/* do not fool net_timestamp_check() with various clock bases */
	skb_clear_tstamp(skb);

	skb_orphan(skb);

	/* Before queueing this packet to __netif_rx(),
	 * make sure dst is refcounted.
	 */
	skb_dst_force(skb);

	skb->protocol = eth_type_trans(skb, dev);

	/* read the length before the skb is handed to __netif_rx() */
	len = skb->len;
	if (likely(__netif_rx(skb) == NET_RX_SUCCESS))
		dev_lstats_add(dev, len);

	return NETDEV_TX_OK;
}

/* Sum the per-CPU lstats counters of @dev over all possible CPUs into
 * @packets and @bytes. Each CPU's pair is read inside a u64_stats
 * fetch/retry loop.
 */
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes)
{
	int i;

	*packets = 0;
	*bytes = 0;

	for_each_possible_cpu(i) {
		const struct pcpu_lstats *lb_stats;
		u64 tbytes, tpackets;
		unsigned int start;

		lb_stats = per_cpu_ptr(dev->lstats, i);
		do {
			start = u64_stats_fetch_begin(&lb_stats->syncp);
			tpackets = u64_stats_read(&lb_stats->packets);
			tbytes = u64_stats_read(&lb_stats->bytes);
		} while (u64_stats_fetch_retry(&lb_stats->syncp, start));
		*bytes   += tbytes;
		*packets += tpackets;
	}
}
EXPORT_SYMBOL(dev_lstats_read);

/* Report the lstats totals as both the rx and the tx counters. */
static void loopback_get_stats64(struct net_device *dev,
				 struct rtnl_link_stats64 *stats)
{
	u64 packets, bytes;

	dev_lstats_read(dev, &packets, &bytes);

	stats->rx_packets = packets;
	stats->tx_packets = packets;
	stats->rx_bytes   = bytes;
	stats->tx_bytes   = bytes;
}

/* ethtool get_link callback: the link is always reported as up. */
static u32 always_on(struct net_device *dev)
{
	return 1;
}

static const struct ethtool_ops loopback_ethtool_ops = {
	.get_link		= always_on,
	.get_ts_info		= ethtool_op_get_ts_info,
};

/* ndo_init callback: sets the lockdep classes for @dev and returns 0. */
static int loopback_dev_init(struct net_device *dev)
{
	netdev_lockdep_set_classes(dev);
	return 0;
}

/* priv_destructor for loopback devices: clears the owning namespace's
 * loopback_dev pointer.
 */
static void loopback_dev_free(struct net_device *dev)
{
	dev_net(dev)->loopback_dev = NULL;
}

static const struct net_device_ops loopback_ops = {
	.ndo_init        = loopback_dev_init,
	.ndo_start_xmit  = loopback_xmit,
	.ndo_get_stats64 = loopback_get_stats64,
	.ndo_set_mac_address = eth_mac_addr,
};

/* Common net_device setup shared by the loopback and blackhole devices.
 * The caller supplies the MTU, the ethtool/header/netdev operation tables
 * and the private destructor; everything else is fixed here.
 */
static void gen_lo_setup(struct net_device *dev,
			 unsigned int mtu,
			 const struct ethtool_ops *eth_ops,
			 const struct header_ops *hdr_ops,
			 const struct net_device_ops *dev_ops,
			 void (*dev_destructor)(struct net_device *dev))
{
	dev->mtu		= mtu;
	dev->hard_header_len	= ETH_HLEN;	/* 14	*/
	dev->min_header_len	= ETH_HLEN;	/* 14	*/
	dev->addr_len		= ETH_ALEN;	/* 6	*/
	dev->type		= ARPHRD_LOOPBACK;	/* 0x0001*/
	dev->flags		= IFF_LOOPBACK;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
	dev->lltx		= true;
	dev->netns_local	= true;
	netif_keep_dst(dev);
	dev->hw_features	= NETIF_F_GSO_SOFTWARE;
	dev->features		= NETIF_F_SG | NETIF_F_FRAGLIST
		| NETIF_F_GSO_SOFTWARE
		| NETIF_F_HW_CSUM
		| NETIF_F_RXCSUM
		| NETIF_F_SCTP_CRC
		| NETIF_F_HIGHDMA
		| NETIF_F_VLAN_CHALLENGED
		| NETIF_F_LOOPBACK;
	dev->ethtool_ops	= eth_ops;
	dev->header_ops		= hdr_ops;
	dev->netdev_ops		= dev_ops;
	dev->needs_free_netdev	= true;
	dev->pcpu_stat_type	= NETDEV_PCPU_STAT_LSTATS;
	dev->priv_destructor	= dev_destructor;

	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
}

/* The loopback device is special. There is only one instance
 * per network namespace.
 */
static void loopback_setup(struct net_device *dev)
{
	gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
		     &loopback_ops, loopback_dev_free);
}

/* Setup and register the loopback device.
 *
 * Returns 0 on success, -ENOMEM if the device cannot be allocated, or the
 * error from register_netdev(). A failure for init_net panics.
 */
static __net_init int loopback_net_init(struct net *net)
{
	struct net_device *dev;
	int err;

	err = -ENOMEM;
	dev = alloc_netdev(0, "lo", NET_NAME_PREDICTABLE, loopback_setup);
	if (!dev)
		goto out;

	dev_net_set(dev, net);
	err = register_netdev(dev);
	if (err)
		goto out_free_netdev;

	BUG_ON(dev->ifindex != LOOPBACK_IFINDEX);
	net->loopback_dev = dev;
	return 0;

out_free_netdev:
	free_netdev(dev);
out:
	if (net_eq(net, &init_net))
		panic("loopback: Failed to register netdevice: %d\n", err);
	return err;
}

/* Registered in net/core/dev.c */
struct pernet_operations __net_initdata loopback_net_ops = {
	.init = loopback_net_init,
};

/* blackhole netdevice */

/* Free every skb handed to the device, emit a rate-limited warning and
 * report NETDEV_TX_OK.
 */
static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
					 struct net_device *dev)
{
	kfree_skb(skb);
	net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
	return NETDEV_TX_OK;
}

static const struct net_device_ops blackhole_netdev_ops = {
	.ndo_start_xmit = blackhole_netdev_xmit,
};

/* This is a dst-dummy device used specifically for invalidated
 * DSTs and unlike loopback, this is not per-ns.
*/
static void blackhole_netdev_setup(struct net_device *dev)
{
	/* Minimum MTU, no ethtool or header ops, no private destructor. */
	gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL,
		     &blackhole_netdev_ops, NULL);
}

/* Allocate the global blackhole device, bring up its queueing discipline
 * under the init_net RTNL lock and mark it up and running. The device is
 * only allocated here, never registered.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __init blackhole_netdev_init(void)
{
	struct net_device *bh_dev;

	bh_dev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
			      blackhole_netdev_setup);
	/* publish the pointer (possibly NULL) before anything else happens */
	blackhole_netdev = bh_dev;
	if (bh_dev == NULL)
		return -ENOMEM;

	rtnl_net_lock(&init_net);
	dev_init_scheduler(bh_dev);
	dev_activate(bh_dev);
	rtnl_net_unlock(&init_net);

	bh_dev->flags |= IFF_UP | IFF_RUNNING;

	return 0;
}

device_initcall(blackhole_netdev_init);
3 3 3 3 3 3 3 3 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/checkpoint.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1999 Red Hat Software --- All Rights Reserved * * Checkpoint routines for the generic filesystem journaling code. * Part of the ext2fs journaling system. * * Checkpointing is the process of ensuring that a section of the log is * committed fully to disk, so that that portion of the log can be * reused. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <trace/events/jbd2.h> /* * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction = jh->b_cp_transaction; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = NULL; } } /* * __jbd2_log_wait_for_space: wait until there is space in the journal. * * Called under j-state_lock *only*. 
It will be unlocked if we have to wait
 * for a checkpoint to free up some space in the log.
 *
 * Returns early, with j_state_lock held, once the journal is found to be
 * aborted.
 */
void __jbd2_log_wait_for_space(journal_t *journal)
__acquires(&journal->j_state_lock)
__releases(&journal->j_state_lock)
{
	int nblocks, space_left;
	/* assert_spin_locked(&journal->j_state_lock); */

	nblocks = journal->j_max_transaction_buffers;
	while (jbd2_log_space_left(journal) < nblocks) {
		/* j_state_lock is dropped before the checkpoint mutex is taken */
		write_unlock(&journal->j_state_lock);
		mutex_lock_io(&journal->j_checkpoint_mutex);

		/*
		 * Test again, another process may have checkpointed while we
		 * were waiting for the checkpoint lock. If there are no
		 * transactions ready to be checkpointed, try to recover
		 * journal space by calling cleanup_journal_tail(), and if
		 * that doesn't work, by waiting for the currently committing
		 * transaction to complete.  If there is absolutely no way
		 * to make progress, this is either a BUG or corrupted
		 * filesystem, so abort the journal and leave a stack
		 * trace for forensic evidence.
		 */
		write_lock(&journal->j_state_lock);
		if (journal->j_flags & JBD2_ABORT) {
			mutex_unlock(&journal->j_checkpoint_mutex);
			return;
		}
		spin_lock(&journal->j_list_lock);
		space_left = jbd2_log_space_left(journal);
		if (space_left < nblocks) {
			/* snapshot the state needed below before the locks are dropped */
			int chkpt = journal->j_checkpoint_transactions != NULL;
			tid_t tid = 0;
			bool has_transaction = false;

			if (journal->j_committing_transaction) {
				tid = journal->j_committing_transaction->t_tid;
				has_transaction = true;
			}
			spin_unlock(&journal->j_list_lock);
			write_unlock(&journal->j_state_lock);
			if (chkpt) {
				jbd2_log_do_checkpoint(journal);
			} else if (jbd2_cleanup_journal_tail(journal) <= 0) {
				/*
				 * We were able to recover space or the
				 * journal was aborted due to an error.
				 */
				;
			} else if (has_transaction) {
				/*
				 * jbd2_journal_commit_transaction() may want
				 * to take the checkpoint_mutex if JBD2_FLUSHED
				 * is set.  So we need to temporarily drop it.
				 */
				mutex_unlock(&journal->j_checkpoint_mutex);
				jbd2_log_wait_commit(journal, tid);
				write_lock(&journal->j_state_lock);
				/* the mutex is already released: skip the unlock at the loop end */
				continue;
			} else {
				printk(KERN_ERR "%s: needed %d blocks and "
				       "only had %d space available\n",
				       __func__, nblocks, space_left);
				printk(KERN_ERR "%s: no way to get more "
				       "journal space in %s\n", __func__,
				       journal->j_devname);
				WARN_ON(1);
				jbd2_journal_abort(journal, -EIO);
			}
			write_lock(&journal->j_state_lock);
		} else {
			spin_unlock(&journal->j_list_lock);
		}
		mutex_unlock(&journal->j_checkpoint_mutex);
	}
}

/*
 * Write out the first *batch_count buffers collected in
 * journal->j_chkpt_bhs[] inside one block plug, then release each buffer,
 * clear its array slot and reset *batch_count to 0.
 */
static void
__flush_batch(journal_t *journal, int *batch_count)
{
	int i;
	struct blk_plug plug;

	blk_start_plug(&plug);
	for (i = 0; i < *batch_count; i++)
		write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
	blk_finish_plug(&plug);

	for (i = 0; i < *batch_count; i++) {
		struct buffer_head *bh = journal->j_chkpt_bhs[i];
		BUFFER_TRACE(bh, "brelse");
		__brelse(bh);
		journal->j_chkpt_bhs[i] = NULL;
	}
	*batch_count = 0;
}

/*
 * Perform an actual checkpoint. We take the first transaction on the
 * list of transactions to be checkpointed and send all its buffers
 * to disk. We submit larger chunks of data at once.
 *
 * The journal should be locked before calling this function.
 * Called with j_checkpoint_mutex held.
 */
int jbd2_log_do_checkpoint(journal_t *journal)
{
	struct journal_head	*jh;
	struct buffer_head	*bh;
	transaction_t		*transaction;
	tid_t			this_tid;
	int			result, batch_count = 0;

	jbd2_debug(1, "Start checkpoint\n");

	/*
	 * First thing: if there are any transactions in the log which
	 * don't need checkpointing, just eliminate them from the
	 * journal straight away.
	 */
	result = jbd2_cleanup_journal_tail(journal);
	trace_jbd2_checkpoint(journal, result);
	jbd2_debug(1, "cleanup_journal_tail returned %d\n", result);
	if (result <= 0)
		return result;

	/*
	 * OK, we need to start writing disk blocks.  Take one transaction
	 * and write it.
*/ spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; transaction = journal->j_checkpoint_transactions; if (transaction->t_chp_stats.cs_chp_time == 0) transaction->t_chp_stats.cs_chp_time = jiffies; this_tid = transaction->t_tid; restart: /* * If someone cleaned up this transaction while we slept, we're * done (maybe it's a new transaction, but it fell at the same * address). */ if (journal->j_checkpoint_transactions != transaction || transaction->t_tid != this_tid) goto out; /* checkpoint all of the transaction's buffers */ while (transaction->t_checkpoint_list) { jh = transaction->t_checkpoint_list; bh = jh2bh(jh); if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* * The journal thread is dead; so * starting and waiting for a commit * to finish will cause us to wait for * a _very_ long time. */ printk(KERN_ERR "JBD2: %s: Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); if (batch_count) __flush_batch(journal, &batch_count); jbd2_log_start_commit(journal, tid); /* * jbd2_journal_commit_transaction() may want * to take the checkpoint_mutex if JBD2_FLUSHED * is set, jbd2_update_log_tail() called by * jbd2_journal_commit_transaction() may also take * checkpoint_mutex. So we need to temporarily * drop it. */ mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); mutex_lock_io(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); goto restart; } if (!trylock_buffer(bh)) { /* * The buffer is locked, it may be writing back, or * flushing out in the last couple of cycles, or * re-adding into a new transaction, need to check * it again until it's unlocked. 
*/ get_bh(bh); spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); goto retry; } else if (!buffer_dirty(bh)) { unlock_buffer(bh); BUFFER_TRACE(bh, "remove from checkpoint"); /* * If the transaction was released or the checkpoint * list was empty, we're done. */ if (__jbd2_journal_remove_checkpoint(jh) || !transaction->t_checkpoint_list) goto out; } else { unlock_buffer(bh); /* * We are about to write the buffer, it could be * raced by some other transaction shrink or buffer * re-log logic once we release the j_list_lock, * leave it on the checkpoint list and check status * again to make sure it's clean. */ BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); journal->j_chkpt_bhs[batch_count++] = bh; transaction->t_chp_stats.cs_written++; transaction->t_checkpoint_list = jh->b_cpnext; } if ((batch_count == JBD2_NR_BATCH) || need_resched() || spin_needbreak(&journal->j_list_lock) || jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0]) goto unlock_and_flush; } if (batch_count) { unlock_and_flush: spin_unlock(&journal->j_list_lock); retry: if (batch_count) __flush_batch(journal, &batch_count); spin_lock(&journal->j_list_lock); goto restart; } out: spin_unlock(&journal->j_list_lock); result = jbd2_cleanup_journal_tail(journal); return (result < 0) ? result : 0; } /* * Check the list of checkpoint transactions for the journal to see if * we have already got rid of any since the last update of the log tail * in the journal superblock. If so, we can instantly roll the * superblock forward to remove those transactions from the log. * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. * * Called with the journal lock held. * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. 
Checkpointing involves writing to the
 * main filesystem area rather than to the journal, so it can proceed
 * even in abort state, but we must not update the super block if
 * checkpointing may have failed.  Otherwise, we would lose some metadata
 * buffers which should be written-back to the filesystem.
 */
int jbd2_cleanup_journal_tail(journal_t *journal)
{
	unsigned long	blocknr;
	tid_t		tail_tid;
	int		err;

	/* Never move the on-disk tail once the journal has been aborted. */
	if (is_journal_aborted(journal))
		return -EIO;

	/* Nothing to do if the log tail has not advanced. */
	if (!jbd2_journal_get_log_tail(journal, &tail_tid, &blocknr))
		return 1;
	J_ASSERT(blocknr != 0);

	/*
	 * Blocks written back recently (for instance by
	 * jbd2_log_do_checkpoint()) must be flushed out before the
	 * transactions that covered them are dropped from the journal.
	 * The flush is only issued when the journal uses barriers.
	 */
	if (journal->j_flags & JBD2_BARRIER)
		blkdev_issue_flush(journal->j_fs_dev);

	err = __jbd2_update_log_tail(journal, tail_tid, blocknr);
	return err;
}


/* Checkpoint list management */

/*
 * journal_shrink_one_cp_list
 *
 * Find all the written-back checkpoint buffers in the given list
 * and try to release them. If the whole transaction is released, set
 * the 'released' parameter. Return the number of released checkpointed
 * buffers.
 *
 * Called with j_list_lock held.
*/
static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
						enum jbd2_shrink_type type,
						bool *released)
{
	struct journal_head *last_jh;
	struct journal_head *next_jh = jh;
	unsigned long nr_freed = 0;
	int ret;

	*released = false;
	if (!jh)
		return 0;

	/* Remember the tail so that one full lap of the ring can be detected. */
	last_jh = jh->b_cpprev;
	do {
		jh = next_jh;
		/* Fetch the successor first: removal below may free jh. */
		next_jh = jh->b_cpnext;

		if (type == JBD2_SHRINK_DESTROY) {
			/* Unconditional removal, buffer state is not checked. */
			ret = __jbd2_journal_remove_checkpoint(jh);
		} else {
			ret = jbd2_journal_try_remove_checkpoint(jh);
			if (ret < 0) {
				/*
				 * Busy buffer: skip over it in BUSY_SKIP
				 * mode ('continue' re-evaluates the loop
				 * condition), otherwise stop scanning.
				 */
				if (type == JBD2_SHRINK_BUSY_SKIP)
					continue;
				break;
			}
		}

		nr_freed++;
		/* Non-zero means the whole transaction was freed. */
		if (ret) {
			*released = true;
			break;
		}

		if (need_resched())
			break;
	} while (jh != last_jh);

	return nr_freed;
}

/*
 * jbd2_journal_shrink_checkpoint_list
 *
 * Find 'nr_to_scan' written-back checkpoint buffers in the journal
 * and try to release them. Return the number of released checkpointed
 * buffers.
 *
 * @nr_to_scan is decremented by the number of buffers released and is
 * never taken below zero.
 *
 * This function acquires and releases j_list_lock itself (and calls
 * cond_resched() between passes), so it must be called without
 * j_list_lock held.
 */
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
						  unsigned long *nr_to_scan)
{
	transaction_t *transaction, *last_transaction, *next_transaction;
	bool __maybe_unused released;
	tid_t first_tid = 0, last_tid = 0, next_tid = 0;
	tid_t tid = 0;
	unsigned long nr_freed = 0;
	unsigned long freed;
	bool first_set = false;

again:
	spin_lock(&journal->j_list_lock);
	if (!journal->j_checkpoint_transactions) {
		spin_unlock(&journal->j_list_lock);
		goto out;
	}

	/*
	 * Get next shrink transaction, resume previous scan or start
	 * over again. If some others do checkpoint and drop transaction
	 * from the checkpoint list, we ignore saved j_shrink_transaction
	 * and start over unconditionally.
	 */
	if (journal->j_shrink_transaction)
		transaction = journal->j_shrink_transaction;
	else
		transaction = journal->j_checkpoint_transactions;

	/* first_tid is recorded once, on the first pass only (for tracing). */
	if (!first_set) {
		first_tid = transaction->t_tid;
		first_set = true;
	}
	last_transaction = journal->j_checkpoint_transactions->t_cpprev;
	next_transaction = transaction;
	last_tid = last_transaction->t_tid;
	do {
		transaction = next_transaction;
		next_transaction = transaction->t_cpnext;
		tid = transaction->t_tid;

		freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
						   JBD2_SHRINK_BUSY_SKIP, &released);
		nr_freed += freed;
		/* Clamp so that the unsigned scan budget cannot wrap below zero. */
		(*nr_to_scan) -= min(*nr_to_scan, freed);
		if (*nr_to_scan == 0)
			break;
		if (need_resched() || spin_needbreak(&journal->j_list_lock))
			break;
	} while (transaction != last_transaction);

	/* Stopped early: save the resume point; otherwise clear the cursor. */
	if (transaction != last_transaction) {
		journal->j_shrink_transaction = next_transaction;
		next_tid = next_transaction->t_tid;
	} else {
		journal->j_shrink_transaction = NULL;
		next_tid = 0;
	}

	spin_unlock(&journal->j_list_lock);
	cond_resched();

	/* Budget left and a saved resume point: run another pass. */
	if (*nr_to_scan && journal->j_shrink_transaction)
		goto again;
out:
	trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
					  nr_freed, next_tid);

	return nr_freed;
}

/*
 * journal_clean_checkpoint_list
 *
 * Find all the written-back checkpoint buffers in the journal and release them.
 * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If
 * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a
 * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some
 * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function.
 *
 * Called with j_list_lock held.
*/
void __jbd2_journal_clean_checkpoint_list(journal_t *journal,
					  enum jbd2_shrink_type type)
{
	transaction_t *cur, *tail, *next;
	bool released;

	WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP);

	cur = journal->j_checkpoint_transactions;
	if (!cur)
		return;

	/* Walk the transaction ring once, from the head up to the old tail. */
	tail = cur->t_cpprev;
	for (;;) {
		/* Read the successor before 'cur' can be released below. */
		next = cur->t_cpnext;
		journal_shrink_one_cp_list(cur->t_checkpoint_list, type,
					   &released);

		/*
		 * This is best-effort memory reclaim, so give up as soon as
		 * a reschedule is wanted.  Also stop at the first
		 * transaction that could not be released in full: scanning
		 * further transactions that are not checkpointed yet would
		 * be pointless.
		 */
		if (need_resched() || !released)
			return;

		if (cur == tail)
			return;
		cur = next;
	}
}

/*
 * Remove buffers from all checkpoint lists as journal is aborted and we just
 * need to free memory
 */
void jbd2_journal_destroy_checkpoint(journal_t *journal)
{
	/*
	 * One pass of __jbd2_journal_clean_checkpoint_list() may stop early
	 * when a reschedule is needed, so keep going until the journal has
	 * no checkpoint transactions left.
	 */
	for (;;) {
		bool drained;

		spin_lock(&journal->j_list_lock);
		drained = !journal->j_checkpoint_transactions;
		if (!drained)
			__jbd2_journal_clean_checkpoint_list(journal,
						JBD2_SHRINK_DESTROY);
		spin_unlock(&journal->j_list_lock);

		if (drained)
			return;
		cond_resched();
	}
}

/*
 * journal_remove_checkpoint: called after a buffer has been committed
 * to disk (either by being write-back flushed to disk, or being
 * committed to the log).
 *
 * We cannot safely clean a transaction out of the log until all of the
 * buffer updates committed in that transaction have safely been stored
 * elsewhere on disk.
To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint * lists until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. * The function can free jh and bh. * * This function is called with j_list_lock held. */ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; JBUFFER_TRACE(jh, "entry"); transaction = jh->b_cp_transaction; if (!transaction) { JBUFFER_TRACE(jh, "not on transaction"); return 0; } journal = transaction->t_journal; JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; percpu_counter_dec(&journal->j_checkpoint_jh_count); jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ if (transaction->t_checkpoint_list) return 0; /* * There is one special case to worry about: if we have just pulled the * buffer off a running or committing transaction's checkpoing list, * then even if the checkpoint list is empty, the transaction obviously * cannot be dropped! * * The locking here around t_state is a bit sleazy. * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) return 0; /* * OK, that was the last buffer for the transaction, we can now * safely remove this transaction from the log. */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, jiffies); trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); return 1; } /* * Check the checkpoint buffer and try to remove it from the checkpoint * list if it's clean. 
Returns -EBUSY if it is not clean, returns 1 if
 * it frees the transaction, 0 otherwise.
 *
 * This function is called with j_list_lock held.
 */
int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);
	bool dirty;

	/* Still attached to a transaction, or someone else holds the lock. */
	if (jh->b_transaction || !trylock_buffer(bh))
		return -EBUSY;

	/* Sample the dirty bit under the buffer lock, then release the lock. */
	dirty = buffer_dirty(bh);
	unlock_buffer(bh);
	if (dirty)
		return -EBUSY;

	/*
	 * The buffer is clean and, since the buffer lock could be taken,
	 * its IO has finished: the checkpoint for it is done and it can
	 * leave this transaction.
	 */
	JBUFFER_TRACE(jh, "remove from checkpoint list");
	return __jbd2_journal_remove_checkpoint(jh);
}

/*
 * journal_insert_checkpoint: put a committed buffer onto a checkpoint
 * list so that we know when it is safe to clean the transaction out of
 * the log.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 */
void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
			       transaction_t *transaction)
{
	struct journal_head *head;

	JBUFFER_TRACE(jh, "entry");
	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);

	/* The checkpoint list holds its own reference on the journal head. */
	jbd2_journal_grab_journal_head(jh2bh(jh));
	jh->b_cp_transaction = transaction;

	head = transaction->t_checkpoint_list;
	if (head) {
		/* Link jh in between the current tail and the current head. */
		struct journal_head *tail = head->b_cpprev;

		jh->b_cpnext = head;
		jh->b_cpprev = tail;
		tail->b_cpnext = jh;
		head->b_cpprev = jh;
	} else {
		/* First entry: a ring of one. */
		jh->b_cpprev = jh;
		jh->b_cpnext = jh;
	}

	/* The new buffer always becomes the list head. */
	transaction->t_checkpoint_list = jh;
	percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count);
}

/*
 * We've finished with this transaction structure: adios...
 *
 * The transaction must have no links except for the checkpoint by this
 * point.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
*/
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
	transaction_t *next, *prev;

	assert_spin_locked(&journal->j_list_lock);

	/* Any saved shrinker resume point may refer to this transaction. */
	journal->j_shrink_transaction = NULL;

	next = transaction->t_cpnext;
	if (next) {
		/* Unlink from the circular list of checkpoint transactions. */
		prev = transaction->t_cpprev;
		next->t_cpprev = prev;
		prev->t_cpnext = next;

		/*
		 * If this was the list head, move the head to the successor,
		 * or empty the list when the transaction was its own
		 * successor (the only entry).
		 */
		if (journal->j_checkpoint_transactions == transaction)
			journal->j_checkpoint_transactions =
				(next == transaction) ? NULL : next;
	}

	/* By now the transaction must be finished and fully detached. */
	J_ASSERT(transaction->t_state == T_FINISHED);
	J_ASSERT(transaction->t_buffers == NULL);
	J_ASSERT(transaction->t_forget == NULL);
	J_ASSERT(transaction->t_shadow_list == NULL);
	J_ASSERT(transaction->t_checkpoint_list == NULL);
	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
	J_ASSERT(journal->j_committing_transaction != transaction);
	J_ASSERT(journal->j_running_transaction != transaction);

	trace_jbd2_drop_transaction(journal, transaction);

	jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}
200 3 310 40 35 3 5 3 3 211 211 1 1 3 9 1 52 51 49 1 8 310 24 308 12 307 244 8 9 8 1 4 5 5 15 15 15 9 9 209 211 4 29 201 3 3 3 38 117 1 9 9 21 107 97 211 102 165 92 91 92 45 8 53 181 1 52 17 53 53 16 45 2 13 53 53 53 23 27 53 183 5 180 180 1 181 93 1 179 1 180 7 177 180 182 10 115 155 155 5 1 149 6 3 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 
951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 * TODO: try to use extents tree (instead of array)
 */

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/log2.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/* runs_tree is a contiguous memory array. Try to avoid big sizes. */
#define NTFS3_RUN_MAX_BYTES 0x10000

/*
 * One mapped extent: @len clusters starting at virtual cluster @vcn,
 * located at logical cluster @lcn (or SPARSE_LCN for a sparse extent).
 */
struct ntfs_run {
	CLST vcn; /* Virtual cluster number. */
	CLST len; /* Length in clusters. */
	CLST lcn; /* Logical cluster number. */
};

/*
 * run_lookup - Lookup the index of a MCB entry that is first <= vcn.
 *
 * Case of success it will return non-zero value and set
 * @index parameter to index of entry been found.
 * Case of entry missing from list 'index' will be set to
 * point to insertion position for the entry question.
*/
static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
{
	size_t min_idx, max_idx, mid_idx;
	struct ntfs_run *r;

	if (!run->count) {
		*index = 0;
		return false;
	}

	min_idx = 0;
	max_idx = run->count - 1;

	/* Check the boundary cases first, because they cover the most frequent requests. */
	r = run->runs;
	if (vcn < r->vcn) {
		/* Before the first run. */
		*index = 0;
		return false;
	}

	if (vcn < r->vcn + r->len) {
		/* Inside the first run. */
		*index = 0;
		return true;
	}

	r += max_idx;
	if (vcn >= r->vcn + r->len) {
		/* Past the end of the last run. */
		*index = run->count;
		return false;
	}

	if (vcn >= r->vcn) {
		/* Inside the last run. */
		*index = max_idx;
		return true;
	}

	/* Binary search over the runs. */
	do {
		mid_idx = min_idx + ((max_idx - min_idx) >> 1);
		r = run->runs + mid_idx;

		if (vcn < r->vcn) {
			/*
			 * When mid_idx is 0 the subtraction wraps; the
			 * 'max_idx + 1' below then yields 0 again.
			 */
			max_idx = mid_idx - 1;
			if (!mid_idx)
				break;
		} else if (vcn >= r->vcn + r->len) {
			min_idx = mid_idx + 1;
		} else {
			*index = mid_idx;
			return true;
		}
	} while (min_idx <= max_idx);

	/* Not found: report the insertion position. */
	*index = max_idx + 1;
	return false;
}

/*
 * run_consolidate - Consolidate runs starting from a given one.
 *
 * Walks forward from @index, trimming overlap from the following run and
 * merging neighbouring runs that are contiguous in both vcn and lcn (or
 * are both sparse).
 */
static void run_consolidate(struct runs_tree *run, size_t index)
{
	size_t i;
	struct ntfs_run *r = run->runs + index;

	while (index + 1 < run->count) {
		/*
		 * Merge the current run with the next one
		 * if the start of the next run lies inside the current one.
		 */
		struct ntfs_run *n = r + 1;
		CLST end = r->vcn + r->len;
		CLST dl;

		/* Stop if there is a gap between the two runs. */
		if (n->vcn > end)
			break;

		dl = end - n->vcn;

		/*
		 * If the range at index overlaps the next one,
		 * either advance the start of the next run
		 * or (if it is completely covered) just remove it from the list.
		 */
		if (dl > 0) {
			if (n->len <= dl)
				goto remove_next_range;

			n->len -= dl;
			n->vcn += dl;
			if (n->lcn != SPARSE_LCN)
				n->lcn += dl;
			dl = 0;
		}

		/*
		 * If exactly one of the two runs is sparse they cannot be
		 * merged: move on and examine the next pair.
		 */
		if ((n->lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) {
			index += 1;
			r = n;
			continue;
		}

		/*
		 * Stop if the next run's lcn does not directly follow
		 * the last cluster of the current run.
		 */
		if (n->lcn != SPARSE_LCN && n->lcn != r->lcn + r->len)
			break;

		/*
		 * Next and current are siblings.
		 * Eat/join.
		 */
		r->len += n->len - dl;

remove_next_range:
		/* Close the gap left by the removed entry 'n'. */
		i = run->count - (index + 1);
		if (i > 1)
			memmove(n, n + 1, sizeof(*n) * (i - 1));

		run->count -= 1;
	}
}

/*
 * run_is_mapped_full
 *
 * Return: True if range [svcn - evcn] is mapped, i.e. it is covered by
 * runs with no gap in between.
 */
bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn)
{
	size_t i;
	const struct ntfs_run *r, *end;
	CLST next_vcn;

	if (!run_lookup(run, svcn, &i))
		return false;

	end = run->runs + run->count;
	r = run->runs + i;

	for (;;) {
		next_vcn = r->vcn + r->len;
		if (next_vcn > evcn)
			return true;

		if (++r >= end)
			return false;

		/* A hole between two consecutive runs. */
		if (r->vcn != next_vcn)
			return false;
	}
}

/*
 * run_lookup_entry - Find the run that contains @vcn.
 *
 * On success stores in *@lcn the logical cluster that corresponds to @vcn
 * (SPARSE_LCN for a sparse run), and, when the pointers are non-NULL, the
 * number of clusters left in the run starting at @vcn in *@len and the
 * index of the run in *@index.
 *
 * Return: true if @vcn lies inside a run, false otherwise.
 */
bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
		      CLST *len, size_t *index)
{
	size_t idx;
	CLST gap;
	struct ntfs_run *r;

	/* Fail immediately if the run array was never allocated. */
	if (!run->runs)
		return false;

	if (!run_lookup(run, vcn, &idx))
		return false;

	r = run->runs + idx;

	if (vcn >= r->vcn + r->len)
		return false;

	/* Offset of vcn inside the run. */
	gap = vcn - r->vcn;
	if (r->len <= gap)
		return false;

	*lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + gap);

	if (len)
		*len = r->len - gap;
	if (index)
		*index = idx;

	return true;
}

/*
 * run_truncate_head - Decommit the range before vcn.
 *
 * Frees the run array when no runs remain.
 */
void run_truncate_head(struct runs_tree *run, CLST vcn)
{
	size_t index;
	struct ntfs_run *r;

	if (run_lookup(run, vcn, &index)) {
		r = run->runs + index;

		/* vcn falls inside this run: cut off the part before it. */
		if (vcn > r->vcn) {
			CLST dlen = vcn - r->vcn;

			r->vcn = vcn;
			r->len -= dlen;
			if (r->lcn != SPARSE_LCN)
				r->lcn += dlen;
		}

		/* Nothing precedes the first run. */
		if (!index)
			return;
	}
	/* Drop the first 'index' runs by shifting the rest to the front. */
	r = run->runs;
	memmove(r, r + index, sizeof(*r) * (run->count - index));

	run->count -= index;

	if (!run->count) {
		kvfree(run->runs);
		run->runs = NULL;
		run->allocated = 0;
	}
}

/*
 * run_truncate - Decommit the range after vcn.
 *
 * Frees the run array when no runs remain.
 */
void run_truncate(struct runs_tree *run, CLST vcn)
{
	size_t index;

	/*
	 * If vcn hits a run, that run has to be truncated.
	 * If the truncated run becomes empty
	 * it is removed entirely.
	 */
	if (run_lookup(run, vcn, &index)) {
		struct ntfs_run *r = run->runs + index;

		r->len = vcn - r->vcn;
		if (r->len > 0)
			index += 1;
	}

	/*
	 * At this point 'index' is the first position that
	 * should be thrown away (including index itself).
	 * Simply set the new limit.
	 */
	run->count = index;

	/* Do not reallocate array 'runs'. Only free if possible. */
	if (!index) {
		kvfree(run->runs);
		run->runs = NULL;
		run->allocated = 0;
	}
}

/*
 * run_truncate_around - Trim head and tail if necessary.
 *
 * Always drops everything before @vcn.  If the remaining run count is at
 * least half of what fits in NTFS3_RUN_MAX_BYTES, the list is additionally
 * cut at the vcn of its middle run.
 */
void run_truncate_around(struct runs_tree *run, CLST vcn)
{
	run_truncate_head(run, vcn);

	if (run->count >= NTFS3_RUN_MAX_BYTES / sizeof(struct ntfs_run) / 2)
		run_truncate(run, (run->runs + (run->count >> 1))->vcn);
}

/*
 * run_add_entry
 *
 * Sets location to known state.
 * Run to be added may overlap with existing location.
 *
 * @is_mft only suppresses the WARN_ON() issued when the array grows beyond
 * NTFS3_RUN_MAX_BYTES.
 *
 * Return: false if out of memory.
 */
bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
		   bool is_mft)
{
	size_t used, index;
	struct ntfs_run *r;
	bool inrange;
	CLST tail_vcn = 0, tail_len = 0, tail_lcn = 0;
	bool should_add_tail = false;

	/*
	 * Lookup the insertion point.
	 *
	 * Binary-search for the entry that contains
	 * the start position in question.
	 */
	inrange = run_lookup(run, vcn, &index);

	/*
	 * Shortcut: the range was not found, but the one being added
	 * directly continues the previous run (same sparseness and
	 * adjacent lcn).
	 * In that case the existing previous run can be used
	 * as the start point.
	 */
	if (!inrange && index > 0) {
		struct ntfs_run *t = run->runs + index - 1;

		if (t->vcn + t->len == vcn &&
		    (t->lcn == SPARSE_LCN) == (lcn == SPARSE_LCN) &&
		    (lcn == SPARSE_LCN || lcn == t->lcn + t->len)) {
			inrange = true;
			index -= 1;
		}
	}

	/*
	 * At this point 'index' either points to the range
	 * containing start position or to the insertion position
	 * for a new range.
	 * So first check whether the range being probed is here already.
	 */
	if (!inrange) {
requires_new_range:
		/*
		 * Range was not found.
		 * Insert at position 'index'
		 */
		used = run->count * sizeof(struct ntfs_run);

		/*
		 * Check allocated space.
		 * If it is not enough to hold one more entry
		 * then the array is reallocated.
		 */
		if (run->allocated < used + sizeof(struct ntfs_run)) {
			size_t bytes;
			struct ntfs_run *new_ptr;

			/* Use power of 2 for 'bytes'. */
			if (!used) {
				bytes = 64;
			} else if (used <= 16 * PAGE_SIZE) {
				if (is_power_of_2(run->allocated))
					bytes = run->allocated << 1;
				else
					bytes = (size_t)1
						<< (2 + blksize_bits(used));
			} else {
				/* Large arrays grow linearly instead. */
				bytes = run->allocated + (16 * PAGE_SIZE);
			}

			WARN_ON(!is_mft && bytes > NTFS3_RUN_MAX_BYTES);

			new_ptr = kvmalloc(bytes, GFP_KERNEL);

			if (!new_ptr)
				return false;

			/* Copy both halves, leaving a hole at 'index'. */
			r = new_ptr + index;
			memcpy(new_ptr, run->runs,
			       index * sizeof(struct ntfs_run));
			memcpy(r + 1, run->runs + index,
			       sizeof(struct ntfs_run) * (run->count - index));

			kvfree(run->runs);
			run->runs = new_ptr;
			run->allocated = bytes;

		} else {
			size_t i = run->count - index;

			r = run->runs + index;

			/* memmove appears to be a bottleneck here... */
			if (i > 0)
				memmove(r + 1, r, sizeof(struct ntfs_run) * i);
		}

		r->vcn = vcn;
		r->lcn = lcn;
		r->len = len;
		run->count += 1;
	} else {
		r = run->runs + index;

		/*
		 * If the new range does not continue the matched one
		 * (different sparseness or non-matching lcn) then
		 * the matched range has to be split and
		 * the new one inserted.
		 * In the common case this requires the tail to be reinserted
		 * by a recursive call.
		 */
		if (((lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) ||
		    (lcn != SPARSE_LCN && lcn != r->lcn + (vcn - r->vcn))) {
			CLST to_eat = vcn - r->vcn;
			CLST Tovcn = to_eat + len;

			should_add_tail = Tovcn < r->len;

			/* Remember what remains of the old run past the new one. */
			if (should_add_tail) {
				tail_lcn = r->lcn == SPARSE_LCN ?
						   SPARSE_LCN :
						   (r->lcn + Tovcn);
				tail_vcn = r->vcn + Tovcn;
				tail_len = r->len - Tovcn;
			}

			if (to_eat > 0) {
				/* Keep the head of the old run, insert after it. */
				r->len = to_eat;
				inrange = false;
				index += 1;
				goto requires_new_range;
			}

			/* lcn should match the one we are going to add. */
			r->lcn = lcn;
		}

		/*
		 * If the existing range already covers the new one then we are done.
		 * Otherwise extend the found one and fall through to the consolidation code.
		 */
		if (r->vcn + r->len < vcn + len)
			r->len += len - ((r->vcn + r->len) - vcn);
	}

	/*
	 * And normalize it starting from insertion point.
	 * It's possible that no insertion was needed, in case the
	 * start point lies within the range of an entry
	 * that 'index' points to.
	 */
	if (inrange && index > 0)
		index -= 1;
	run_consolidate(run, index);
	run_consolidate(run, index + 1);

	/*
	 * A special case.
	 * We have to add the saved remainder as an extra tail range.
	 */
	if (should_add_tail &&
	    !run_add_entry(run, tail_vcn, tail_lcn, tail_len, is_mft))
		return false;

	return true;
}

/* run_collapse_range
 *
 * Helper for attr_collapse_range(),
 * which is helper for fallocate(collapse_range).
 *
 * Return: false only when run_add_entry() fails while splitting a run.
 */
bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len)
{
	size_t index, eat;
	struct ntfs_run *r, *e, *eat_start, *eat_end;
	CLST end;

	if (WARN_ON(!run_lookup(run, vcn, &index)))
		return true; /* Should never be here. */

	e = run->runs + run->count;
	r = run->runs + index;
	end = vcn + len;

	if (vcn > r->vcn) {
		if (r->vcn + r->len <= end) {
			/* Collapse tail of run. */
			r->len = vcn - r->vcn;
		} else if (r->lcn == SPARSE_LCN) {
			/* Collapse a middle part of sparse run. */
			r->len -= len;
		} else {
			/*
			 * Collapse a middle part of normal run: split it by
			 * inserting a sparse run, then start over.
			 */
			if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
				return false;
			return run_collapse_range(run, vcn, len);
		}

		r += 1;
	}

	eat_start = r;
	eat_end = r;

	for (; r < e; r++) {
		CLST d;

		/* Entirely after the collapsed range: just shift down. */
		if (r->vcn >= end) {
			r->vcn -= len;
			continue;
		}

		if (r->vcn + r->len <= end) {
			/* Eat this run. */
			eat_end = r + 1;
			continue;
		}

		/* Straddles the end: cut off the first 'd' clusters and shift. */
		d = end - r->vcn;
		if (r->lcn != SPARSE_LCN)
			r->lcn += d;
		r->len -= d;
		r->vcn -= len - d;
	}

	/* Remove the eaten runs [eat_start, eat_end). */
	eat = eat_end - eat_start;
	memmove(eat_start, eat_end, (e - eat_end) * sizeof(*r));
	run->count -= eat;

	return true;
}

/* run_insert_range
 *
 * Helper for attr_insert_range(),
 * which is helper for fallocate(insert_range).
 *
 * Return: false if @vcn is not inside any run or if run_add_entry() fails.
 */
bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len)
{
	size_t index;
	struct ntfs_run *r, *e;

	if (WARN_ON(!run_lookup(run, vcn, &index)))
		return false; /* Should never be here. */

	e = run->runs + run->count;
	r = run->runs + index;

	/* A run that merely contains vcn keeps its own start. */
	if (vcn > r->vcn)
		r += 1;

	/* Shift every following run up by 'len'. */
	for (; r < e; r++)
		r->vcn += len;

	r = run->runs + index;

	if (vcn > r->vcn) {
		/* Split fragment: re-add the second half after the inserted range. */
		CLST len1 = vcn - r->vcn;
		CLST len2 = r->len - len1;
		CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1);

		r->len = len1;

		if (!run_add_entry(run, vcn + len, lcn2, len2, false))
			return false;
	}

	/* The inserted range itself is sparse. */
	if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
		return false;

	return true;
}

/*
 * run_get_entry - Return index-th mapped region.
 *
 * Each of @vcn, @lcn and @len may be NULL when the caller does not need it.
 *
 * Return: false if @index is out of range or the run has zero length.
 */
bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
		   CLST *lcn, CLST *len)
{
	const struct ntfs_run *r;

	if (index >= run->count)
		return false;

	r = run->runs + index;

	if (!r->len)
		return false;

	if (vcn)
		*vcn = r->vcn;
	if (lcn)
		*lcn = r->lcn;
	if (len)
		*len = r->len;
	return true;
}

/*
 * run_packed_size - Calculate the size of packed int64.
 *
 * Counts how many low-order bytes of @n must be kept so that the value can
 * be restored by sign extension.
 */
#ifdef __BIG_ENDIAN
static inline int run_packed_size(const s64 n)
{
	/* Start at the least significant byte (the last one on big-endian). */
	const u8 *p = (const u8 *)&n + sizeof(n) - 1;

	if (n >= 0) {
		if (p[-7] || p[-6] || p[-5] || p[-4])
			p -= 4;
		if (p[-3] || p[-2])
			p -= 2;
		if (p[-1])
			p -= 1;
		/* Top bit set: one more byte is needed to keep the sign. */
		if (p[0] & 0x80)
			p -= 1;
	} else {
		if (p[-7] != 0xff || p[-6] != 0xff || p[-5] != 0xff ||
		    p[-4] != 0xff)
			p -= 4;
		if (p[-3] != 0xff || p[-2] != 0xff)
			p -= 2;
		if (p[-1] != 0xff)
			p -= 1;
		/* Top bit clear: one more byte is needed to keep the sign. */
		if (!(p[0] & 0x80))
			p -= 1;
	}
	return (const u8 *)&n + sizeof(n) - p;
}

/*
 * Fully trusted function. It does not check 'size' for errors.
 *
 * Stores the 'size' low-order bytes of @v into @run_buf, least significant
 * byte first.
 */
static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v)
{
	const u8 *p = (u8 *)&v;

	switch (size) {
	case 8:
		run_buf[7] = p[0];
		fallthrough;
	case 7:
		run_buf[6] = p[1];
		fallthrough;
	case 6:
		run_buf[5] = p[2];
		fallthrough;
	case 5:
		run_buf[4] = p[3];
		fallthrough;
	case 4:
		run_buf[3] = p[4];
		fallthrough;
	case 3:
		run_buf[2] = p[5];
		fallthrough;
	case 2:
		run_buf[1] = p[6];
		fallthrough;
	case 1:
		run_buf[0] = p[7];
	}
}

/* Full trusted function. It does not check 'size' for errors.
*/ static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) { u8 *p = (u8 *)&v; switch (size) { case 8: p[0] = run_buf[7]; fallthrough; case 7: p[1] = run_buf[6]; fallthrough; case 6: p[2] = run_buf[5]; fallthrough; case 5: p[3] = run_buf[4]; fallthrough; case 4: p[4] = run_buf[3]; fallthrough; case 3: p[5] = run_buf[2]; fallthrough; case 2: p[6] = run_buf[1]; fallthrough; case 1: p[7] = run_buf[0]; } return v; } #else static inline int run_packed_size(const s64 n) { const u8 *p = (const u8 *)&n; if (n >= 0) { if (p[7] || p[6] || p[5] || p[4]) p += 4; if (p[3] || p[2]) p += 2; if (p[1]) p += 1; if (p[0] & 0x80) p += 1; } else { if (p[7] != 0xff || p[6] != 0xff || p[5] != 0xff || p[4] != 0xff) p += 4; if (p[3] != 0xff || p[2] != 0xff) p += 2; if (p[1] != 0xff) p += 1; if (!(p[0] & 0x80)) p += 1; } return 1 + p - (const u8 *)&n; } /* Full trusted function. It does not check 'size' for errors. */ static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v) { const u8 *p = (u8 *)&v; /* memcpy( run_buf, &v, size); Is it faster? */ switch (size) { case 8: run_buf[7] = p[7]; fallthrough; case 7: run_buf[6] = p[6]; fallthrough; case 6: run_buf[5] = p[5]; fallthrough; case 5: run_buf[4] = p[4]; fallthrough; case 4: run_buf[3] = p[3]; fallthrough; case 3: run_buf[2] = p[2]; fallthrough; case 2: run_buf[1] = p[1]; fallthrough; case 1: run_buf[0] = p[0]; } } /* full trusted function. It does not check 'size' for errors */ static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) { u8 *p = (u8 *)&v; /* memcpy( &v, run_buf, size); Is it faster? */ switch (size) { case 8: p[7] = run_buf[7]; fallthrough; case 7: p[6] = run_buf[6]; fallthrough; case 6: p[5] = run_buf[5]; fallthrough; case 5: p[4] = run_buf[4]; fallthrough; case 4: p[3] = run_buf[3]; fallthrough; case 3: p[2] = run_buf[2]; fallthrough; case 2: p[1] = run_buf[1]; fallthrough; case 1: p[0] = run_buf[0]; } return v; } #endif /* * run_pack - Pack runs into buffer. 
* * packed_vcns - How much runs we have packed. * packed_size - How much bytes we have used run_buf. */ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, u32 run_buf_size, CLST *packed_vcns) { CLST next_vcn, vcn, lcn; CLST prev_lcn = 0; CLST evcn1 = svcn + len; const struct ntfs_run *r, *r_end; int packed_size = 0; size_t i; s64 dlcn; int offset_size, size_size, tmp; *packed_vcns = 0; if (!len) goto out; /* Check all required entries [svcn, encv1) available. */ if (!run_lookup(run, svcn, &i)) return -ENOENT; r_end = run->runs + run->count; r = run->runs + i; for (next_vcn = r->vcn + r->len; next_vcn < evcn1; next_vcn = r->vcn + r->len) { if (++r >= r_end || r->vcn != next_vcn) return -ENOENT; } /* Repeat cycle above and pack runs. Assume no errors. */ r = run->runs + i; len = svcn - r->vcn; vcn = svcn; lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len); len = r->len - len; for (;;) { next_vcn = vcn + len; if (next_vcn > evcn1) len = evcn1 - vcn; /* How much bytes required to pack len. */ size_size = run_packed_size(len); /* offset_size - How much bytes is packed dlcn. */ if (lcn == SPARSE_LCN) { offset_size = 0; dlcn = 0; } else { /* NOTE: lcn can be less than prev_lcn! */ dlcn = (s64)lcn - prev_lcn; offset_size = run_packed_size(dlcn); prev_lcn = lcn; } tmp = run_buf_size - packed_size - 2 - offset_size; if (tmp <= 0) goto out; /* Can we store this entire run. */ if (tmp < size_size) goto out; if (run_buf) { /* Pack run header. */ run_buf[0] = ((u8)(size_size | (offset_size << 4))); run_buf += 1; /* Pack the length of run. */ run_pack_s64(run_buf, size_size, len); run_buf += size_size; /* Pack the offset from previous LCN. */ run_pack_s64(run_buf, offset_size, dlcn); run_buf += offset_size; } packed_size += 1 + offset_size + size_size; *packed_vcns += len; if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1) goto out; r += 1; vcn = r->vcn; lcn = r->lcn; len = r->len; } out: /* Store last zero. 
*/ if (run_buf) run_buf[0] = 0; return packed_size + 1; } /* * run_unpack - Unpack packed runs from @run_buf. * * Return: Error if negative, or real used bytes. */ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, int run_buf_size) { u64 prev_lcn, vcn64, lcn, next_vcn; const u8 *run_last, *run_0; bool is_mft = ino == MFT_REC_MFT; if (run_buf_size < 0) return -EINVAL; /* Check for empty. */ if (evcn + 1 == svcn) return 0; if (evcn < svcn) return -EINVAL; run_0 = run_buf; run_last = run_buf + run_buf_size; prev_lcn = 0; vcn64 = svcn; /* Read all runs the chain. */ /* size_size - How much bytes is packed len. */ while (run_buf < run_last) { /* size_size - How much bytes is packed len. */ u8 size_size = *run_buf & 0xF; /* offset_size - How much bytes is packed dlcn. */ u8 offset_size = *run_buf++ >> 4; u64 len; if (!size_size) break; /* * Unpack runs. * NOTE: Runs are stored little endian order * "len" is unsigned value, "dlcn" is signed. * Large positive number requires to store 5 bytes * e.g.: 05 FF 7E FF FF 00 00 00 */ if (size_size > sizeof(len)) return -EINVAL; len = run_unpack_s64(run_buf, size_size, 0); /* Skip size_size. */ run_buf += size_size; if (!len) return -EINVAL; if (!offset_size) lcn = SPARSE_LCN64; else if (offset_size <= sizeof(s64)) { s64 dlcn; /* Initial value of dlcn is -1 or 0. */ dlcn = (run_buf[offset_size - 1] & 0x80) ? (s64)-1 : 0; dlcn = run_unpack_s64(run_buf, offset_size, dlcn); /* Skip offset_size. */ run_buf += offset_size; if (!dlcn) return -EINVAL; lcn = prev_lcn + dlcn; prev_lcn = lcn; } else { /* The size of 'dlcn' can't be > 8. */ return -EINVAL; } next_vcn = vcn64 + len; /* Check boundary. 
*/ if (next_vcn > evcn + 1) return -EINVAL; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (next_vcn > 0x100000000ull || (lcn + len) > 0x100000000ull) { ntfs_err( sbi->sb, "This driver is compiled without CONFIG_NTFS3_64BIT_CLUSTER (like windows driver).\n" "Volume contains 64 bits run: vcn %llx, lcn %llx, len %llx.\n" "Activate CONFIG_NTFS3_64BIT_CLUSTER to process this case", vcn64, lcn, len); return -EOPNOTSUPP; } #endif if (lcn != SPARSE_LCN64 && lcn + len > sbi->used.bitmap.nbits) { /* LCN range is out of volume. */ return -EINVAL; } if (!run) ; /* Called from check_attr(fslog.c) to check run. */ else if (run == RUN_DEALLOCATE) { /* * Called from ni_delete_all to free clusters * without storing in run. */ if (lcn != SPARSE_LCN64) mark_as_free_ex(sbi, lcn, len, true); } else if (vcn64 >= vcn) { if (!run_add_entry(run, vcn64, lcn, len, is_mft)) return -ENOMEM; } else if (next_vcn > vcn) { u64 dlen = vcn - vcn64; if (!run_add_entry(run, vcn, lcn + dlen, len - dlen, is_mft)) return -ENOMEM; } vcn64 = next_vcn; } if (vcn64 != evcn + 1) { /* Not expected length of unpacked runs. */ return -EINVAL; } return run_buf - run_0; } #ifdef NTFS3_CHECK_FREE_CLST /* * run_unpack_ex - Unpack packed runs from "run_buf". * * Checks unpacked runs to be used in bitmap. * * Return: Error if negative, or real used bytes. 
*/
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
		  CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
		  int run_buf_size)
{
	int ret, err;
	CLST next_vcn, lcn, len;
	size_t index, done;
	bool ok, zone;
	struct wnd_bitmap *wnd;

	ret = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf, run_buf_size);
	if (ret <= 0)
		return ret;

	/* Nothing to verify without a bitmap or without a real run list. */
	if (!sbi->used.bitmap.sb || !run || run == RUN_DEALLOCATE)
		return ret;

	/* Runs of the bad-cluster file are not checked against the bitmap. */
	if (ino == MFT_REC_BADCLUST)
		return ret;

	next_vcn = vcn = svcn;
	wnd = &sbi->used.bitmap;

	/* Walk the freshly unpacked entries covering [svcn, evcn]. */
	for (ok = run_lookup_entry(run, vcn, &lcn, &len, &index);
	     next_vcn <= evcn;
	     ok = run_get_entry(run, ++index, &vcn, &lcn, &len)) {
		/* Entries must exist and follow each other without gaps. */
		if (!ok || next_vcn != vcn)
			return -EINVAL;

		next_vcn = vcn + len;

		if (lcn == SPARSE_LCN)
			continue;

		if (sbi->flags & NTFS_FLAGS_NEED_REPLAY)
			continue;

		down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
		/* Does [lcn, lcn + len) intersect [zone_bit, zone_end)? */
		zone = max(wnd->zone_bit, lcn) < min(wnd->zone_end, lcn + len);
		/* Check for free blocks. */
		ok = !zone && wnd_is_used(wnd, lcn, len);
		up_read(&wnd->rw_lock);
		if (ok)
			continue;

		/* Looks like volume is corrupted. */
		ntfs_set_state(sbi, NTFS_DIRTY_ERROR);

		/* Repair is best effort: skip it if the lock is contended. */
		if (!down_write_trylock(&wnd->rw_lock))
			continue;

		if (zone) {
			/*
			 * Range [lcn, lcn + len) intersects with zone.
			 * To avoid complex with zone just turn it off.
			 */
			wnd_zone_set(wnd, 0, 0);
		}

		/* Mark all zero bits as used in range [lcn, lcn+len). */
		err = wnd_set_used_safe(wnd, lcn, len, &done);
		if (zone) {
			/* Restore zone. Lock mft run. */
			struct rw_semaphore *lock =
				is_mounted(sbi) ? &sbi->mft.ni->file.run_lock :
						  NULL;
			if (lock)
				down_read(lock);
			ntfs_refresh_zone(sbi);
			if (lock)
				up_read(lock);
		}
		up_write(&wnd->rw_lock);
		if (err)
			return err;
	}

	return ret;
}
#endif

/*
 * run_get_highest_vcn
 *
 * Return the highest vcn from a mapping pairs array
 * it used while replaying log file.
*/ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) { u64 vcn64 = vcn; u8 size_size; while ((size_size = *run_buf & 0xF)) { u8 offset_size = *run_buf++ >> 4; u64 len; if (size_size > 8 || offset_size > 8) return -EINVAL; len = run_unpack_s64(run_buf, size_size, 0); if (!len) return -EINVAL; run_buf += size_size + offset_size; vcn64 += len; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (vcn64 > 0x100000000ull) return -EINVAL; #endif } *highest_vcn = vcn64 - 1; return 0; } /* * run_clone * * Make a copy of run */ int run_clone(const struct runs_tree *run, struct runs_tree *new_run) { size_t bytes = run->count * sizeof(struct ntfs_run); if (bytes > new_run->allocated) { struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL); if (!new_ptr) return -ENOMEM; kvfree(new_run->runs); new_run->runs = new_ptr; new_run->allocated = bytes; } memcpy(new_run->runs, run->runs, bytes); new_run->count = run->count; return 0; }
154 152 76 82 25 66 78 9 2 67 75 2 3 71 11 67 18 18 2 16 4 3 1 1 4 4 142 152 133 79 1 74 2 3 63 4 36 67 66 2 64 86 10 67 89 1 89 65 89 64 64 79 18 2 154 108 2 89 116 3 7 74 2 75 152 1 152 91 6 131 6 128 77 153 105 75 45 28 12 32 4 4 4 3 2 2 2 2 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 // SPDX-License-Identifier: GPL-2.0-only /* * vfsv0 quota IO operations on file */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/dqblk_v2.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/quotaops.h> #include <asm/byteorder.h> #include "quota_tree.h" MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota trie support"); 
MODULE_LICENSE("GPL");

/*
 * Maximum quota tree depth we support. Only to limit recursion when working
 * with the tree.
 */
#define MAX_QTREE_DEPTH 6

#define __QUOTA_QT_PARANOIA

/*
 * Index of the reference for @id inside the tree block at level @depth.
 * Each block holds dqi_usable_bs / 4 references; @id is divided by that
 * count once for every level below @depth, and the remainder is returned.
 */
static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
{
	unsigned int epb = info->dqi_usable_bs >> 2;

	depth = info->dqi_qtree_depth - depth - 1;
	while (depth--)
		id /= epb;
	return id % epb;
}

/* Same as __get_index(), taking a struct kqid instead of a raw id. */
static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
{
	qid_t id = from_kqid(&init_user_ns, qid);

	return __get_index(info, id, depth);
}

/* Number of entries in one block */
static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
{
	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
	       / info->dqi_entry_size;
}

/*
 * Read block @blk of the quota file into @buf (dqi_usable_bs bytes).
 * The buffer is zeroed first, so a short read leaves zeroes behind.
 * Returns whatever the filesystem's quota_read() returns.
 */
static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
{
	struct super_block *sb = info->dqi_sb;

	memset(buf, 0, info->dqi_usable_bs);
	return sb->s_op->quota_read(sb, info->dqi_type, buf,
	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
}

/*
 * Write @buf to block @blk of the quota file.
 * A short write is reported as -EIO; a negative result of quota_write()
 * is passed through unchanged.
 */
static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
{
	struct super_block *sb = info->dqi_sb;
	ssize_t ret;

	ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
	if (ret != info->dqi_usable_bs) {
		quota_error(sb, "dquota write failed");
		if (ret >= 0)
			ret = -EIO;
	}
	return ret;
}

/*
 * Return 0 if @val lies in [@min_val, @max_val]; otherwise log an error
 * naming @val_name and return -EUCLEAN.
 */
static inline int do_check_range(struct super_block *sb, const char *val_name,
				 uint val, uint min_val, uint max_val)
{
	if (val < min_val || val > max_val) {
		quota_error(sb, "Getting %s %u out of range %u-%u",
			    val_name, val, min_val, max_val);
		return -EUCLEAN;
	}

	return 0;
}

/*
 * Validate the header of a data block read from disk: both free-list links
 * must be below dqi_blocks and the entry count must not exceed the number
 * of entries that fit into one block.
 */
static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
				    struct qt_disk_dqdbheader *dh)
{
	int err = 0;

	err = do_check_range(info->dqi_sb, "dqdh_next_free",
			     le32_to_cpu(dh->dqdh_next_free), 0,
			     info->dqi_blocks - 1);
	if (err)
		return err;
	err = do_check_range(info->dqi_sb, "dqdh_prev_free",
			     le32_to_cpu(dh->dqdh_prev_free), 0,
			     info->dqi_blocks - 1);
	if (err)
		return err;
	err = do_check_range(info->dqi_sb, "dqdh_entries",
			     le16_to_cpu(dh->dqdh_entries), 0,
			     qtree_dqstr_in_blk(info));

	return err;
}

/*
 * Remove empty block from list and return it.
 *
 * If the free-block list is empty, a new zeroed block is appended to the
 * file instead. Returns the block number, or a negative error.
 */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
	int ret, blk;

	if (!buf)
		return -ENOMEM;
	if (info->dqi_free_blk) {
		blk = info->dqi_free_blk;
		ret = read_blk(info, blk, buf);
		if (ret < 0)
			goto out_buf;
		ret = check_dquot_block_header(info, dh);
		if (ret)
			goto out_buf;
		/* Unlink: the list head moves on to the next free block. */
		info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
	} else {
		memset(buf, 0, info->dqi_usable_bs);
		/* Assure block allocation... */
		ret = write_blk(info, info->dqi_blocks, buf);
		if (ret < 0)
			goto out_buf;
		blk = info->dqi_blocks++;
	}
	mark_info_dirty(info->dqi_sb, info->dqi_type);
	ret = blk;
out_buf:
	kfree(buf);
	return ret;
}

/*
 * Insert empty block to the list.
 *
 * @buf is rewritten as an empty header linked in front of the current
 * free-block list and written to @blk; the in-memory list head is only
 * updated after the write succeeded.
 */
static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
{
	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
	int err;

	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
	dh->dqdh_prev_free = cpu_to_le32(0);
	dh->dqdh_entries = cpu_to_le16(0);
	err = write_blk(info, blk, buf);
	if (err < 0)
		return err;
	info->dqi_free_blk = blk;
	mark_info_dirty(info->dqi_sb, info->dqi_type);
	return 0;
}

/*
 * Remove given block from the list of blocks with free entries.
 *
 * @buf holds the current contents of @blk. The neighbours named in its
 * header are re-linked to each other on disk first; then the block's own
 * links are cleared and it is written back.
 */
static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
			       uint blk)
{
	char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
	int err;

	if (!tmpbuf)
		return -ENOMEM;
	if (nextblk) {
		err = read_blk(info, nextblk, tmpbuf);
		if (err < 0)
			goto out_buf;
		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
							dh->dqdh_prev_free;
		err = write_blk(info, nextblk, tmpbuf);
		if (err < 0)
			goto out_buf;
	}
	if (prevblk) {
		err = read_blk(info, prevblk, tmpbuf);
		if (err < 0)
			goto out_buf;
		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
							dh->dqdh_next_free;
		err = write_blk(info, prevblk, tmpbuf);
		if (err < 0)
			goto out_buf;
	} else {
		/* @blk was the list head: the successor becomes the new head. */
		info->dqi_free_entry = nextblk;
		mark_info_dirty(info->dqi_sb, info->dqi_type);
	}
	kfree(tmpbuf);
	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
	/* No matter whether write succeeds block is out of list */
	if (write_blk(info, blk, buf) < 0)
		quota_error(info->dqi_sb, "Can't write block (%u) "
			    "with free entries", blk);
	return 0;
out_buf:
	kfree(tmpbuf);
	return err;
}

/*
 * Insert given block to the beginning of list with free entries.
 *
 * @buf (contents of @blk) is written with its links updated, then the
 * previous head's back link is pointed at @blk, and finally the in-memory
 * head is switched.
 */
static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
			       uint blk)
{
	char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
	int err;

	if (!tmpbuf)
		return -ENOMEM;
	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
	dh->dqdh_prev_free = cpu_to_le32(0);
	err = write_blk(info, blk, buf);
	if (err < 0)
		goto out_buf;
	if (info->dqi_free_entry) {
		err = read_blk(info, info->dqi_free_entry, tmpbuf);
		if (err < 0)
			goto out_buf;
		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
							cpu_to_le32(blk);
		err = write_blk(info, info->dqi_free_entry, tmpbuf);
		if (err < 0)
			goto out_buf;
	}
	kfree(tmpbuf);
	info->dqi_free_entry = blk;
	mark_info_dirty(info->dqi_sb, info->dqi_type);
	return 0;
out_buf:
	kfree(tmpbuf);
	return err;
}

/* Is the entry in the block free?
*/
int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
{
	int i;

	/* An entry counts as unused when all dqi_entry_size bytes are zero. */
	for (i = 0; i < info->dqi_entry_size; i++)
		if (disk[i])
			return 0;
	return 1;
}
EXPORT_SYMBOL(qtree_entry_unused);

/*
 * Find space for dquot.
 *
 * Takes the first block from the free-entry list (or gets a fresh block),
 * bumps its entry count, locates the first unused slot and stores the
 * slot's file offset in dquot->dq_off.
 *
 * Returns the data block number; on failure returns 0 with a negative
 * value in *err. Note that *err is also overwritten with the results of
 * the intermediate read/write calls.
 */
static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
			      struct dquot *dquot, int *err)
{
	uint blk, i;
	struct qt_disk_dqdbheader *dh;
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	char *ddquot;

	*err = 0;
	if (!buf) {
		*err = -ENOMEM;
		return 0;
	}
	dh = (struct qt_disk_dqdbheader *)buf;
	if (info->dqi_free_entry) {
		blk = info->dqi_free_entry;
		*err = read_blk(info, blk, buf);
		if (*err < 0)
			goto out_buf;
		*err = check_dquot_block_header(info, dh);
		if (*err)
			goto out_buf;
	} else {
		blk = get_free_dqblk(info);
		/* get_free_dqblk() reports errors as negative ints. */
		if ((int)blk < 0) {
			*err = blk;
			kfree(buf);
			return 0;
		}
		memset(buf, 0, info->dqi_usable_bs);
		/* This is enough as the block is already zeroed and the entry
		 * list is empty... */
		info->dqi_free_entry = blk;
		mark_info_dirty(dquot->dq_sb, dquot->dq_id.type);
	}
	/* Block will be full? */
	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
		*err = remove_free_dqentry(info, buf, blk);
		if (*err < 0) {
			quota_error(dquot->dq_sb, "Can't remove block (%u) "
				    "from entry free list", blk);
			goto out_buf;
		}
	}
	le16_add_cpu(&dh->dqdh_entries, 1);
	/* Find free structure in block */
	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
		if (qtree_entry_unused(info, ddquot))
			break;
		ddquot += info->dqi_entry_size;
	}
#ifdef __QUOTA_QT_PARANOIA
	if (i == qtree_dqstr_in_blk(info)) {
		quota_error(dquot->dq_sb, "Data block full but it shouldn't");
		*err = -EIO;
		goto out_buf;
	}
#endif
	*err = write_blk(info, blk, buf);
	if (*err < 0) {
		quota_error(dquot->dq_sb, "Can't write quota data block %u",
			    blk);
		goto out_buf;
	}
	dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) +
			sizeof(struct qt_disk_dqdbheader) +
			i * info->dqi_entry_size;
	kfree(buf);
	return blk;
out_buf:
	kfree(buf);
	return 0;
}

/*
 * Insert reference to structure into the trie.
 *
 * @blks records the block used at each level of the current path
 * (blks[0] is the root); it is used to detect cycles and reuse of a block
 * that is already on the path. A level whose blks[] slot is 0 gets a
 * newly allocated block, which is given back to the free list if a deeper
 * level fails.
 */
static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
			  uint *blks, int depth)
{
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	int ret = 0, newson = 0, newact = 0;
	__le32 *ref;
	uint newblk;
	int i;

	if (!buf)
		return -ENOMEM;
	if (!blks[depth]) {
		ret = get_free_dqblk(info);
		if (ret < 0)
			goto out_buf;
		for (i = 0; i < depth; i++)
			if (ret == blks[i]) {
				quota_error(dquot->dq_sb,
					"Free block already used in tree: block %u",
					ret);
				ret = -EIO;
				goto out_buf;
			}
		blks[depth] = ret;
		memset(buf, 0, info->dqi_usable_bs);
		/* newact: this level's block was allocated by this call. */
		newact = 1;
	} else {
		ret = read_blk(info, blks[depth], buf);
		if (ret < 0) {
			quota_error(dquot->dq_sb, "Can't read tree quota "
				    "block %u", blks[depth]);
			goto out_buf;
		}
	}
	ref = (__le32 *)buf;
	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
	ret = do_check_range(dquot->dq_sb, "block", newblk, 0,
			     info->dqi_blocks - 1);
	if (ret)
		goto out_buf;
	if (!newblk) {
		/* newson: the reference slot is empty and must be filled in. */
		newson = 1;
	} else {
		for (i = 0; i <= depth; i++)
			if (newblk == blks[i]) {
				quota_error(dquot->dq_sb,
					"Cycle in quota tree detected: block %u index %u",
					blks[depth],
					get_index(info, dquot->dq_id, depth));
				ret = -EIO;
				goto out_buf;
			}
	}
	blks[depth + 1] = newblk;
	if (depth == info->dqi_qtree_depth - 1) {
#ifdef __QUOTA_QT_PARANOIA
		if (newblk) {
			quota_error(dquot->dq_sb, "Inserting already present "
				    "quota entry (block %u)",
				    le32_to_cpu(ref[get_index(info,
						dquot->dq_id, depth)]));
			ret = -EIO;
			goto out_buf;
		}
#endif
		blks[depth + 1] = find_free_dqentry(info, dquot, &ret);
	} else {
		ret = do_insert_tree(info, dquot, blks, depth + 1);
	}
	if (newson && ret >= 0) {
		ref[get_index(info, dquot->dq_id, depth)] =
						cpu_to_le32(blks[depth + 1]);
		ret = write_blk(info, blks[depth], buf);
	} else if (newact && ret < 0) {
		/* Undo the allocation made above; its result is not checked. */
		put_free_dqblk(info, buf, blks[depth]);
	}
out_buf:
	kfree(buf);
	return ret;
}

/*
 * Wrapper for inserting quota structure into tree.
 * Rejects a file without a root block and a tree deeper than the blks[]
 * path array can describe.
 */
static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
				 struct dquot *dquot)
{
	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };

#ifdef __QUOTA_QT_PARANOIA
	if (info->dqi_blocks <= QT_TREEOFF) {
		quota_error(dquot->dq_sb, "Quota tree root isn't allocated!");
		return -EIO;
	}
#endif
	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
		quota_error(dquot->dq_sb, "Quota tree depth too big!");
		return -EIO;
	}
	return do_insert_tree(info, dquot, blks, 0);
}

/*
 * We don't have to be afraid of deadlocks as we never have quotas on quota
 * files...
*/
int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
	int type = dquot->dq_id.type;
	struct super_block *sb = dquot->dq_sb;
	ssize_t ret;
	char *ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);

	if (!ddquot)
		return -ENOMEM;

	/* dq_off is guarded by dqio_sem */
	if (!dquot->dq_off) {
		/* No on-disk slot yet: allocate one in the tree first. */
		ret = dq_insert_tree(info, dquot);
		if (ret < 0) {
			quota_error(sb, "Error %zd occurred while creating "
				    "quota", ret);
			kfree(ddquot);
			return ret;
		}
	}
	/* Convert to disk format under the lock, write without it. */
	spin_lock(&dquot->dq_dqb_lock);
	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
	spin_unlock(&dquot->dq_dqb_lock);
	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
				    dquot->dq_off);
	if (ret != info->dqi_entry_size) {
		quota_error(sb, "dquota write failed");
		/* A short write is reported as -ENOSPC. */
		if (ret >= 0)
			ret = -ENOSPC;
	} else {
		ret = 0;
	}
	dqstats_inc(DQST_WRITES);
	kfree(ddquot);

	return ret;
}
EXPORT_SYMBOL(qtree_write_dquot);

/*
 * Free dquot entry in data block.
 *
 * Decrements the entry count of @blk. A block that becomes empty is moved
 * to the free-block list; otherwise the entry is zeroed and, if the block
 * was full before, it is put back on the free-entry list.
 * On success dquot->dq_off is reset to 0.
 */
static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
			uint blk)
{
	struct qt_disk_dqdbheader *dh;
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	int ret = 0;

	if (!buf)
		return -ENOMEM;
	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
		quota_error(dquot->dq_sb, "Quota structure has offset to "
			"other block (%u) than it should (%u)", blk,
			(uint)(dquot->dq_off >> info->dqi_blocksize_bits));
		ret = -EIO;
		goto out_buf;
	}
	ret = read_blk(info, blk, buf);
	if (ret < 0) {
		quota_error(dquot->dq_sb, "Can't read quota data block %u",
			    blk);
		goto out_buf;
	}
	dh = (struct qt_disk_dqdbheader *)buf;
	ret = check_dquot_block_header(info, dh);
	if (ret)
		goto out_buf;
	le16_add_cpu(&dh->dqdh_entries, -1);
	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
		ret = remove_free_dqentry(info, buf, blk);
		if (ret >= 0)
			ret = put_free_dqblk(info, buf, blk);
		if (ret < 0) {
			quota_error(dquot->dq_sb, "Can't move quota data block "
				    "(%u) to free list", blk);
			goto out_buf;
		}
	} else {
		/* Zero the entry at its offset inside the block. */
		memset(buf +
		       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
		       0, info->dqi_entry_size);
		if (le16_to_cpu(dh->dqdh_entries) ==
		    qtree_dqstr_in_blk(info) - 1) {
			/* Insert will write block itself */
			ret = insert_free_dqentry(info, buf, blk);
			if (ret < 0) {
				quota_error(dquot->dq_sb, "Can't insert quota "
				    "data block (%u) to free entry list", blk);
				goto out_buf;
			}
		} else {
			ret = write_blk(info, blk, buf);
			if (ret < 0) {
				quota_error(dquot->dq_sb, "Can't write quota "
					    "data block %u", blk);
				goto out_buf;
			}
		}
	}
	dquot->dq_off = 0;	/* Quota is now unattached */
out_buf:
	kfree(buf);
	return ret;
}

/*
 * Remove reference to dquot from tree.
 *
 * Descends along the path for dquot->dq_id, recording it in @blks. On the
 * way back each level clears its reference when the child was released
 * (blks[depth + 1] == 0) and releases its own block when it became empty,
 * except for the root block.
 */
static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
		       uint *blks, int depth)
{
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	int ret = 0;
	uint newblk;
	__le32 *ref = (__le32 *)buf;
	int i;

	if (!buf)
		return -ENOMEM;
	ret = read_blk(info, blks[depth], buf);
	if (ret < 0) {
		quota_error(dquot->dq_sb, "Can't read quota data block %u",
			    blks[depth]);
		goto out_buf;
	}
	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
	ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF,
			     info->dqi_blocks - 1);
	if (ret)
		goto out_buf;

	for (i = 0; i <= depth; i++)
		if (newblk == blks[i]) {
			quota_error(dquot->dq_sb,
				"Cycle in quota tree detected: block %u index %u",
				blks[depth],
				get_index(info, dquot->dq_id, depth));
			ret = -EIO;
			goto out_buf;
		}
	if (depth == info->dqi_qtree_depth - 1) {
		ret = free_dqentry(info, dquot, newblk);
		/* The leaf reference is always cleared below. */
		blks[depth + 1] = 0;
	} else {
		blks[depth + 1] = newblk;
		ret = remove_tree(info, dquot, blks, depth + 1);
	}
	if (ret >= 0 && !blks[depth + 1]) {
		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
		/* Block got empty? */
		for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
			;
		/* Don't put the root block into the free block list */
		if (i == (info->dqi_usable_bs >> 2)
		    && blks[depth] != QT_TREEOFF) {
			put_free_dqblk(info, buf, blks[depth]);
			/* Tell the parent level to drop its reference too. */
			blks[depth] = 0;
		} else {
			ret = write_blk(info, blks[depth], buf);
			if (ret < 0)
				quota_error(dquot->dq_sb,
					    "Can't write quota tree block %u",
					    blks[depth]);
		}
	}
out_buf:
	kfree(buf);
	return ret;
}

/* Delete dquot from tree */
int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };

	if (!dquot->dq_off)	/* Even not allocated? */
		return 0;
	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
		quota_error(dquot->dq_sb, "Quota tree depth too big!");
		return -EIO;
	}
	return remove_tree(info, dquot, blks, 0);
}
EXPORT_SYMBOL(qtree_delete_dquot);

/*
 * Find entry in block.
 *
 * Scans the entries of data block @blk with the format's is_id() callback.
 * Returns the file offset of the matching entry, or a negative error
 * (-EIO when no entry in the block matches).
 */
static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
				 struct dquot *dquot, uint blk)
{
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	loff_t ret = 0;
	int i;
	char *ddquot;

	if (!buf)
		return -ENOMEM;
	ret = read_blk(info, blk, buf);
	if (ret < 0) {
		quota_error(dquot->dq_sb, "Can't read quota tree "
			    "block %u", blk);
		goto out_buf;
	}
	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
		if (info->dqi_ops->is_id(ddquot, dquot))
			break;
		ddquot += info->dqi_entry_size;
	}
	if (i == qtree_dqstr_in_blk(info)) {
		quota_error(dquot->dq_sb,
			    "Quota for id %u referenced but not present",
			    from_kqid(&init_user_ns, dquot->dq_id));
		ret = -EIO;
		goto out_buf;
	} else {
		ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct
		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
	}
out_buf:
	kfree(buf);
	return ret;
}

/*
 * Find entry for given id in the tree.
 *
 * Returns the file offset of the entry, 0 when the path ends in an empty
 * reference (no entry for this id), or a negative error.
 */
static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
				struct dquot *dquot, uint *blks, int depth)
{
	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
	loff_t ret = 0;
	__le32 *ref = (__le32 *)buf;
	uint blk;
	int i;

	if (!buf)
		return -ENOMEM;
	ret = read_blk(info, blks[depth], buf);
	if (ret < 0) {
		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
			    blks[depth]);
		goto out_buf;
	}
	ret = 0;
	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
	if (!blk)	/* No reference? */
		goto out_buf;
	ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF,
			     info->dqi_blocks - 1);
	if (ret)
		goto out_buf;

	/* Check for cycles in the tree */
	for (i = 0; i <= depth; i++)
		if (blk == blks[i]) {
			quota_error(dquot->dq_sb,
				"Cycle in quota tree detected: block %u index %u",
				blks[depth],
				get_index(info, dquot->dq_id, depth));
			ret = -EIO;
			goto out_buf;
		}
	blks[depth + 1] = blk;
	if (depth < info->dqi_qtree_depth - 1)
		ret = find_tree_dqentry(info, dquot, blks, depth + 1);
	else
		ret = find_block_dqentry(info, dquot, blk);
out_buf:
	kfree(buf);
	return ret;
}

/* Find entry for given id in the tree - wrapper function */
static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
				  struct dquot *dquot)
{
	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };

	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
		quota_error(dquot->dq_sb, "Quota tree depth too big!");
		return -EIO;
	}
	return find_tree_dqentry(info, dquot, blks, 0);
}

/*
 * Read the on-disk entry of @dquot into memory.
 *
 * If the entry's offset is not known it is looked up in the tree first.
 * When no entry exists, or reading fails, the dquot is marked DQ_FAKE_B
 * and its in-memory usage/limits are zeroed.
 *
 * Returns 0 on success (including "no entry on disk") or a negative error.
 */
int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
{
	int type = dquot->dq_id.type;
	struct super_block *sb = dquot->dq_sb;
	loff_t offset;
	char *ddquot;
	int ret = 0;

#ifdef __QUOTA_QT_PARANOIA
	/* Invalidated quota? */
	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
		quota_error(sb, "Quota invalidated while reading!");
		return -EIO;
	}
#endif
	/* Do we know offset of the dquot entry in the quota file? */
	if (!dquot->dq_off) {
		offset = find_dqentry(info, dquot);
		if (offset <= 0) {	/* Entry not present? */
			if (offset < 0)
				quota_error(sb,"Can't read quota structure "
					    "for id %u",
					    from_kqid(&init_user_ns,
						      dquot->dq_id));
			dquot->dq_off = 0;
			set_bit(DQ_FAKE_B, &dquot->dq_flags);
			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
			ret = offset;
			goto out;
		}
		dquot->dq_off = offset;
	}
	ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
	if (!ddquot)
		return -ENOMEM;
	ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
				   dquot->dq_off);
	if (ret != info->dqi_entry_size) {
		/* A short read is reported as -EIO. */
		if (ret >= 0)
			ret = -EIO;
		quota_error(sb, "Error while reading quota structure for id %u",
			    from_kqid(&init_user_ns, dquot->dq_id));
		set_bit(DQ_FAKE_B, &dquot->dq_flags);
		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
		kfree(ddquot);
		goto out;
	}
	spin_lock(&dquot->dq_dqb_lock);
	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
	/* An entry without any limit set is flagged DQ_FAKE_B as well. */
	if (!dquot->dq_dqb.dqb_bhardlimit &&
	    !dquot->dq_dqb.dqb_bsoftlimit &&
	    !dquot->dq_dqb.dqb_ihardlimit &&
	    !dquot->dq_dqb.dqb_isoftlimit)
		set_bit(DQ_FAKE_B, &dquot->dq_flags);
	spin_unlock(&dquot->dq_dqb_lock);
	kfree(ddquot);
out:
	dqstats_inc(DQST_READS);
	return ret;
}
EXPORT_SYMBOL(qtree_read_dquot);

/* Check whether dquot should not be deleted.
We know we are * the only one operating on dquot (thanks to dq_lock) */ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) return qtree_delete_dquot(info, dquot); return 0; } EXPORT_SYMBOL(qtree_release_dquot); static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, unsigned int blk, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL); __le32 *ref = (__le32 *)buf; ssize_t ret; unsigned int epb = info->dqi_usable_bs >> 2; unsigned int level_inc = 1; int i; if (!buf) return -ENOMEM; for (i = depth; i < info->dqi_qtree_depth - 1; i++) level_inc *= epb; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(info->dqi_sb, "Can't read quota tree block %u", blk); goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { uint blk_no = le32_to_cpu(ref[i]); if (blk_no == 0) { *id += level_inc; continue; } ret = do_check_range(info->dqi_sb, "block", blk_no, 0, info->dqi_blocks - 1); if (ret) goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = 0; goto out_buf; } ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; } if (i == epb) { ret = -ENOENT; goto out_buf; } out_buf: kfree(buf); return ret; } int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid) { qid_t id = from_kqid(&init_user_ns, *qid); int ret; ret = find_next_id(info, &id, QT_TREEOFF, 0); if (ret < 0) return ret; *qid = make_kqid(&init_user_ns, qid->type, id); return 0; } EXPORT_SYMBOL(qtree_get_next_id);
12 60 1 56 2 58 27 27 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 // SPDX-License-Identifier: GPL-2.0 #include <linux/buffer_head.h> #include "minix.h" enum {DIRECT = 7, DEPTH = 4}; /* Have triple indirect */ typedef u32 block_t; /* 32 bit, host order */ static inline unsigned long block_to_cpu(block_t n) { return n; } static inline block_t cpu_to_block(unsigned long n) { return n; } static inline block_t *i_data(struct inode *inode) { return (block_t *)minix_i(inode)->u.i2_data; } #define DIRCOUNT 7 #define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2)) static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; struct super_block *sb = inode->i_sb; if (block < 0) { printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", block, sb->s_bdev); return 0; } if ((u64)block * (u64)sb->s_blocksize >= sb->s_maxbytes) return 0; if (block < DIRCOUNT) { offsets[n++] = block; } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { offsets[n++] = DIRCOUNT; offsets[n++] = block; } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) { offsets[n++] = DIRCOUNT + 1; offsets[n++] = block / INDIRCOUNT(sb); offsets[n++] = block % INDIRCOUNT(sb); } else { block -= INDIRCOUNT(sb) * INDIRCOUNT(sb); offsets[n++] = DIRCOUNT + 2; offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb); offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb); offsets[n++] = block % INDIRCOUNT(sb); } return n; } #include "itree_common.c" int V2_minix_get_block(struct inode * inode, long block, struct buffer_head *bh_result, int create) { return get_block(inode, block, bh_result, create); } void V2_minix_truncate(struct inode * inode) { truncate(inode); } unsigned V2_minix_blocks(loff_t size, struct super_block *sb) { return nblocks(size, sb); }
2 2 6 6 1 5 5 5 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 // SPDX-License-Identifier: GPL-2.0-only /* * Binary Increase Congestion control for TCP * Home page: * http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC * This is from the implementation of BICTCP in * Lison-Xu, Kahaled Harfoush, and Injong Rhee. * "Binary Increase Congestion Control for Fast, Long Distance * Networks" in InfoComm 2004 * Available from: * http://netsrv.csc.ncsu.edu/export/bitcp.pdf * * Unless BIC is enabled and congestion window is large * this behaves the same as the original Reno. 
*/ #include <linux/mm.h> #include <linux/module.h> #include <net/tcp.h> #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta */ #define BICTCP_B 4 /* * In binary search, * go to point (max+min)/N */ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ static int initial_ssthresh; static int smooth_part = 20; module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); module_param(max_increment, int, 0644); MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); /* BIC TCP Parameters */ struct bictcp { u32 cnt; /* increase cwnd by 1 after ACKs */ u32 last_max_cwnd; /* last maximum snd_cwnd */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ }; static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } static void bictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); bictcp_reset(ca); if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } /* * Compute congestion window to use. 
*/ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { if (ca->last_cwnd == cwnd && (s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32) return; ca->last_cwnd = cwnd; ca->last_time = tcp_jiffies32; if (ca->epoch_start == 0) /* record the beginning of an epoch */ ca->epoch_start = tcp_jiffies32; /* start off normal */ if (cwnd <= low_window) { ca->cnt = cwnd; return; } /* binary increase */ if (cwnd < ca->last_max_cwnd) { __u32 dist = (ca->last_max_cwnd - cwnd) / BICTCP_B; if (dist > max_increment) /* linear increase */ ca->cnt = cwnd / max_increment; else if (dist <= 1U) /* binary search increase */ ca->cnt = (cwnd * smooth_part) / BICTCP_B; else /* binary search increase */ ca->cnt = cwnd / dist; } else { /* slow start AMD linear increase */ if (cwnd < ca->last_max_cwnd + BICTCP_B) /* slow start */ ca->cnt = (cwnd * smooth_part) / BICTCP_B; else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) /* slow start */ ca->cnt = (cwnd * (BICTCP_B-1)) / (cwnd - ca->last_max_cwnd); else /* linear increase */ ca->cnt = cwnd / max_increment; } /* if in slow start or link utilization is very low */ if (ca->last_max_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; if (ca->cnt == 0) /* cannot be zero */ ca->cnt = 1; } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } bictcp_update(ca, tcp_snd_cwnd(tp)); tcp_cong_avoid_ai(tp, ca->cnt, acked); } /* * behave like Reno until low_window is reached, * then increase congestion window slowly */ static u32 bictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ if (tcp_snd_cwnd(tp) < 
ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else ca->last_max_cwnd = tcp_snd_cwnd(tp); if (tcp_snd_cwnd(tp) <= low_window) return max(tcp_snd_cwnd(tp) >> 1U, 2U); else return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) bictcp_reset(inet_csk_ca(sk)); } /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); ca->delayed_ack += sample->pkts_acked - (ca->delayed_ack >> ACK_RATIO_SHIFT); } } static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", }; static int __init bictcp_register(void) { BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&bictcp); } static void __exit bictcp_unregister(void) { tcp_unregister_congestion_control(&bictcp); } module_init(bictcp_register); module_exit(bictcp_unregister); MODULE_AUTHOR("Stephen Hemminger"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("BIC TCP");
32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (c) 2015, Sony Mobile Communications Inc. * Copyright (c) 2013, The Linux Foundation. All rights reserved. * Copyright (c) 2020, Linaro Ltd. 
*/

#include <linux/module.h>
#include <linux/qrtr.h>
#include <linux/workqueue.h>
#include <net/sock.h>

#include "qrtr.h"

#include <trace/events/sock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/qrtr.h>

/* All known nodes, indexed by node id; entries are struct qrtr_node */
static DEFINE_XARRAY(nodes);

/* Global name service state */
static struct {
	struct socket *sock;
	struct sockaddr_qrtr bcast_sq;	/* broadcast address of the control port */
	struct list_head lookups;	/* registered struct qrtr_lookup observers */
	struct workqueue_struct *workqueue;
	struct work_struct work;
	int local_node;
} qrtr_ns;

/* Names used for tracing received control packets, indexed by command */
static const char * const qrtr_ctrl_pkt_strings[] = {
	[QRTR_TYPE_HELLO]	= "hello",
	[QRTR_TYPE_BYE]		= "bye",
	[QRTR_TYPE_NEW_SERVER]	= "new-server",
	[QRTR_TYPE_DEL_SERVER]	= "del-server",
	[QRTR_TYPE_DEL_CLIENT]	= "del-client",
	[QRTR_TYPE_RESUME_TX]	= "resume-tx",
	[QRTR_TYPE_EXIT]	= "exit",
	[QRTR_TYPE_PING]	= "ping",
	[QRTR_TYPE_NEW_LOOKUP]	= "new-lookup",
	[QRTR_TYPE_DEL_LOOKUP]	= "del-lookup",
};

struct qrtr_server_filter {
	unsigned int service;
	unsigned int instance;
	unsigned int ifilter;	/* mask applied to the instance before comparing */
};

struct qrtr_lookup {
	unsigned int service;
	unsigned int instance;

	struct sockaddr_qrtr sq;	/* address of the observer */
	struct list_head li;		/* link in qrtr_ns.lookups */
};

struct qrtr_server {
	unsigned int service;
	unsigned int instance;

	unsigned int node;
	unsigned int port;

	struct list_head qli;
};

struct qrtr_node {
	unsigned int id;
	struct xarray servers;	/* struct qrtr_server entries, indexed by port */
};

/*
 * Look up the node with id @node_id, creating and registering it when it is
 * not known yet. Returns NULL if allocation or insertion fails.
 */
static struct qrtr_node *node_get(unsigned int node_id)
{
	struct qrtr_node *node;

	node = xa_load(&nodes, node_id);
	if (node)
		return node;

	/* If node didn't exist, allocate and insert it to the tree */
	node = kzalloc(sizeof(*node), GFP_KERNEL);
	if (!node)
		return NULL;

	node->id = node_id;
	xa_init(&node->servers);

	if (xa_store(&nodes, node_id, node, GFP_KERNEL)) {
		kfree(node);
		return NULL;
	}

	return node;
}

/*
 * Return non-zero when @srv passes filter @f. A zero f->service matches any
 * service. The instance is compared under the mask f->ifilter; when no mask
 * is given but an instance is, all bits are compared.
 */
static int server_match(const struct qrtr_server *srv,
			const struct qrtr_server_filter *f)
{
	unsigned int ifilter = f->ifilter;

	if (f->service != 0 && srv->service != f->service)
		return 0;
	if (!ifilter && f->instance)
		ifilter = ~0;

	return (srv->instance & ifilter) == f->instance;
}

/*
 * Send a NEW_SERVER control packet describing @srv to @dest.
 * Returns the result of kernel_sendmsg().
 */
static int service_announce_new(struct sockaddr_qrtr *dest,
				struct qrtr_server *srv)
{
	struct qrtr_ctrl_pkt pkt;
	struct msghdr msg = { };
	struct kvec iv;

	trace_qrtr_ns_service_announce_new(srv->service, srv->instance,
					   srv->node, srv->port);

	iv.iov_base = &pkt;
	iv.iov_len = sizeof(pkt);

	memset(&pkt, 0, sizeof(pkt));
	pkt.cmd = cpu_to_le32(QRTR_TYPE_NEW_SERVER);
	pkt.server.service = cpu_to_le32(srv->service);
	pkt.server.instance = cpu_to_le32(srv->instance);
	pkt.server.node = cpu_to_le32(srv->node);
	pkt.server.port = cpu_to_le32(srv->port);

	msg.msg_name = (struct sockaddr *)dest;
	msg.msg_namelen = sizeof(*dest);

	return kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
}

/*
 * Send a DEL_SERVER control packet describing @srv to @dest. Failures are
 * only logged, and -ENODEV is ignored silently.
 */
static void service_announce_del(struct sockaddr_qrtr *dest,
				 struct qrtr_server *srv)
{
	struct qrtr_ctrl_pkt pkt;
	struct msghdr msg = { };
	struct kvec iv;
	int ret;

	trace_qrtr_ns_service_announce_del(srv->service, srv->instance,
					   srv->node, srv->port);

	iv.iov_base = &pkt;
	iv.iov_len = sizeof(pkt);

	memset(&pkt, 0, sizeof(pkt));
	pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_SERVER);
	pkt.server.service = cpu_to_le32(srv->service);
	pkt.server.instance = cpu_to_le32(srv->instance);
	pkt.server.node = cpu_to_le32(srv->node);
	pkt.server.port = cpu_to_le32(srv->port);

	msg.msg_name = (struct sockaddr *)dest;
	msg.msg_namelen = sizeof(*dest);

	ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
	if (ret < 0 && ret != -ENODEV)
		pr_err("failed to announce del service\n");

	return;
}

/*
 * Notify observer @to about @srv appearing (@new) or disappearing (!@new).
 * With @srv == NULL the server fields of the packet stay zero; callers use
 * this as an end-of-list marker. Failures other than -ENODEV are logged.
 */
static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv,
			  bool new)
{
	struct qrtr_ctrl_pkt pkt;
	struct msghdr msg = { };
	struct kvec iv;
	int ret;

	iv.iov_base = &pkt;
	iv.iov_len = sizeof(pkt);

	memset(&pkt, 0, sizeof(pkt));
	pkt.cmd = new ? cpu_to_le32(QRTR_TYPE_NEW_SERVER) :
			cpu_to_le32(QRTR_TYPE_DEL_SERVER);
	if (srv) {
		pkt.server.service = cpu_to_le32(srv->service);
		pkt.server.instance = cpu_to_le32(srv->instance);
		pkt.server.node = cpu_to_le32(srv->node);
		pkt.server.port = cpu_to_le32(srv->port);
	}

	msg.msg_name = (struct sockaddr *)to;
	msg.msg_namelen = sizeof(*to);

	ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
	if (ret < 0 && ret != -ENODEV)
		pr_err("failed to send lookup notification\n");
}

/*
 * Announce every server registered on the local node to @sq. A send that
 * fails with -ENODEV is skipped; any other failure aborts the walk and is
 * returned. Returns 0 otherwise, including when the local node cannot be
 * obtained.
 */
static int announce_servers(struct sockaddr_qrtr *sq)
{
	struct qrtr_server *srv;
	struct qrtr_node *node;
	unsigned long index;
	int ret;

	node = node_get(qrtr_ns.local_node);
	if (!node)
		return 0;

	/* Announce the list of servers registered in this node */
	xa_for_each(&node->servers, index, srv) {
		ret = service_announce_new(sq, srv);
		if (ret < 0) {
			if (ret == -ENODEV)
				continue;

			pr_err("failed to announce new service\n");
			return ret;
		}
	}
	return 0;
}

/*
 * Create a server record and store it under @port in the servers array of
 * node @node_id, replacing (and freeing) any record already stored for that
 * port. Returns the new record, or NULL when @service or @port is zero or
 * when allocation/insertion fails.
 */
static struct qrtr_server *server_add(unsigned int service,
				      unsigned int instance,
				      unsigned int node_id,
				      unsigned int port)
{
	struct qrtr_server *srv;
	struct qrtr_server *old;
	struct qrtr_node *node;

	if (!service || !port)
		return NULL;

	srv = kzalloc(sizeof(*srv), GFP_KERNEL);
	if (!srv)
		return NULL;

	srv->service = service;
	srv->instance = instance;
	srv->node = node_id;
	srv->port = port;

	node = node_get(node_id);
	if (!node)
		goto err;

	/* Delete the old server on the same port */
	old = xa_store(&node->servers, port, srv, GFP_KERNEL);
	if (old) {
		if (xa_is_err(old)) {
			pr_err("failed to add server [0x%x:0x%x] ret:%d\n",
			       srv->service, srv->instance, xa_err(old));
			goto err;
		} else {
			kfree(old);
		}
	}

	trace_qrtr_ns_server_add(srv->service, srv->instance,
				 srv->node, srv->port);

	return srv;

err:
	kfree(srv);
	return NULL;
}

/*
 * Remove the server registered on @port of @node and free it. When @bcast
 * is set and the server is local, a DEL_SERVER is broadcast; matching
 * lookups are notified in either case. Returns -ENOENT if no server is
 * registered on @port, 0 otherwise.
 */
static int server_del(struct qrtr_node *node, unsigned int port, bool bcast)
{
	struct qrtr_lookup *lookup;
	struct qrtr_server *srv;
	struct list_head *li;

	srv = xa_load(&node->servers, port);
	if (!srv)
		return -ENOENT;

	xa_erase(&node->servers, port);

	/* Broadcast the removal of local servers */
	if (srv->node == qrtr_ns.local_node && bcast)
		service_announce_del(&qrtr_ns.bcast_sq, srv);

	/* Announce the service's disappearance to observers */
	list_for_each(li, &qrtr_ns.lookups) {
		lookup = container_of(li, struct qrtr_lookup, li);
		if (lookup->service && lookup->service != srv->service)
			continue;
		if (lookup->instance && lookup->instance != srv->instance)
			continue;

		lookup_notify(&lookup->sq, srv, false);
	}

	kfree(srv);

	return 0;
}

/* Send a HELLO control packet to @dest; returns the kernel_sendmsg() result. */
static int say_hello(struct sockaddr_qrtr *dest)
{
	struct qrtr_ctrl_pkt pkt;
	struct msghdr msg = { };
	struct kvec iv;
	int ret;

	iv.iov_base = &pkt;
	iv.iov_len = sizeof(pkt);

	memset(&pkt, 0, sizeof(pkt));
	pkt.cmd = cpu_to_le32(QRTR_TYPE_HELLO);

	msg.msg_name = (struct sockaddr *)dest;
	msg.msg_namelen = sizeof(*dest);

	ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
	if (ret < 0)
		pr_err("failed to send hello msg\n");

	return ret;
}

/* Announce the list of servers registered on the local node */
static int ctrl_cmd_hello(struct sockaddr_qrtr *sq)
{
	int ret;

	/* Answer the HELLO first, then list the local servers */
	ret = say_hello(sq);
	if (ret < 0)
		return ret;

	return announce_servers(sq);
}

/*
 * Handle BYE from node @from: drop every server known for that node, then
 * forward a BYE naming the node to every local server.
 */
static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
{
	struct qrtr_node *local_node;
	struct qrtr_ctrl_pkt pkt;
	struct qrtr_server *srv;
	struct sockaddr_qrtr sq;
	struct msghdr msg = { };
	struct qrtr_node *node;
	unsigned long index;
	struct kvec iv;
	int ret;

	iv.iov_base = &pkt;
	iv.iov_len = sizeof(pkt);

	node = node_get(from->sq_node);
	if (!node)
		return 0;

	/* Advertise removal of this client to all servers of remote node */
	xa_for_each(&node->servers, index, srv)
		server_del(node, srv->port, true);

	/* Advertise the removal of this client to all local servers */
	local_node = node_get(qrtr_ns.local_node);
	if (!local_node)
		return 0;

	memset(&pkt, 0, sizeof(pkt));
	pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE);
	pkt.client.node = cpu_to_le32(from->sq_node);

	xa_for_each(&local_node->servers, index, srv) {
		sq.sq_family = AF_QIPCRTR;
		sq.sq_node = srv->node;
		sq.sq_port = srv->port;

		msg.msg_name = (struct sockaddr *)&sq;
		msg.msg_namelen = sizeof(sq);

		ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
		if (ret < 0 && ret != -ENODEV) {
			pr_err("failed to send bye cmd\n");
			return ret;
		}
	}
	return 0;
}

/*
 * Handle DEL_CLIENT for @node_id:@port received from @from: drop the
 * client's lookups and its server (without broadcasting), then forward the
 * DEL_CLIENT to every local server. Returns -EINVAL for messages whose
 * sender does not match the client being removed.
 */
static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
			       unsigned int node_id, unsigned int port)
{
	struct qrtr_node *local_node;
	struct qrtr_lookup *lookup;
	struct qrtr_ctrl_pkt pkt;
	struct msghdr msg = { };
	struct qrtr_server *srv;
	struct sockaddr_qrtr sq;
	struct qrtr_node *node;
	struct list_head *tmp;
	struct list_head *li;
	unsigned long index;
	struct kvec iv;
	int ret;

	iv.iov_base = &pkt;
	iv.iov_len = sizeof(pkt);

	/* Don't accept spoofed messages */
	if (from->sq_node != node_id)
		return -EINVAL;

	/* Local DEL_CLIENT messages come from the port being closed */
	if (from->sq_node == qrtr_ns.local_node && from->sq_port != port)
		return -EINVAL;

	/* Remove any lookups by this client */
	list_for_each_safe(li, tmp, &qrtr_ns.lookups) {
		lookup = container_of(li, struct qrtr_lookup, li);
		if (lookup->sq.sq_node != node_id)
			continue;
		if (lookup->sq.sq_port != port)
			continue;

		list_del(&lookup->li);
		kfree(lookup);
	}

	/* Remove the server belonging to this port but don't broadcast
	 * DEL_SERVER. Neighbours would've already removed the server belonging
	 * to this port due to the DEL_CLIENT broadcast from qrtr_port_remove().
	 */
	node = node_get(node_id);
	if (node)
		server_del(node, port, false);

	/* Advertise the removal of this client to all local servers */
	local_node = node_get(qrtr_ns.local_node);
	if (!local_node)
		return 0;

	memset(&pkt, 0, sizeof(pkt));
	pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT);
	pkt.client.node = cpu_to_le32(node_id);
	pkt.client.port = cpu_to_le32(port);

	xa_for_each(&local_node->servers, index, srv) {
		sq.sq_family = AF_QIPCRTR;
		sq.sq_node = srv->node;
		sq.sq_port = srv->port;

		msg.msg_name = (struct sockaddr *)&sq;
		msg.msg_namelen = sizeof(sq);

		ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
		if (ret < 0 && ret != -ENODEV) {
			pr_err("failed to send del client cmd\n");
			return ret;
		}
	}
	return 0;
}

/*
 * Handle NEW_SERVER: register the server, broadcast it when it is local and
 * notify every lookup whose service/instance matches. Returns -EINVAL when
 * the server cannot be added, or the error from the broadcast.
 */
static int ctrl_cmd_new_server(struct sockaddr_qrtr *from,
			       unsigned int service, unsigned int instance,
			       unsigned int node_id, unsigned int port)
{
	struct qrtr_lookup *lookup;
	struct qrtr_server *srv;
	struct list_head *li;
	int ret = 0;

	/* Ignore specified node and port for local servers */
	if (from->sq_node == qrtr_ns.local_node) {
		node_id = from->sq_node;
		port = from->sq_port;
	}

	srv = server_add(service, instance, node_id, port);
	if (!srv)
		return -EINVAL;

	if (srv->node == qrtr_ns.local_node) {
		ret = service_announce_new(&qrtr_ns.bcast_sq, srv);
		if (ret < 0) {
			pr_err("failed to announce new service\n");
			return ret;
		}
	}

	/* Notify any potential lookups about the new server */
	list_for_each(li, &qrtr_ns.lookups) {
		lookup = container_of(li, struct qrtr_lookup, li);
		if (lookup->service && lookup->service != service)
			continue;
		if (lookup->instance && lookup->instance != instance)
			continue;

		lookup_notify(&lookup->sq, srv, true);
	}

	return ret;
}

/*
 * Handle DEL_SERVER: remove the server on @node_id:@port and broadcast the
 * removal. The result of server_del() is deliberately not propagated.
 * Returns -ENOENT only when the node cannot be obtained.
 */
static int ctrl_cmd_del_server(struct sockaddr_qrtr *from,
			       unsigned int service, unsigned int instance,
			       unsigned int node_id, unsigned int port)
{
	struct qrtr_node *node;

	/* Ignore specified node and port for local servers*/
	if (from->sq_node == qrtr_ns.local_node) {
		node_id = from->sq_node;
		port = from->sq_port;
	}

	/* Local servers may only unregister themselves */
	if (from->sq_node == qrtr_ns.local_node && from->sq_port != port)
		return -EINVAL;

	node = node_get(node_id);
	if (!node)
		return -ENOENT;

	server_del(node, port, true);

	return 0;
}

/*
 * Handle NEW_LOOKUP from a local observer: remember the lookup, report all
 * currently known servers that match it and finish with an empty
 * notification. Returns -EINVAL for non-local senders, -ENOMEM on
 * allocation failure.
 */
static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
			       unsigned int service, unsigned int instance)
{
	struct qrtr_server_filter filter;
	struct qrtr_lookup *lookup;
	struct qrtr_server *srv;
	struct qrtr_node *node;
	unsigned long node_idx;
	unsigned long srv_idx;

	/* Accept only local observers */
	if (from->sq_node != qrtr_ns.local_node)
		return -EINVAL;

	lookup = kzalloc(sizeof(*lookup), GFP_KERNEL);
	if (!lookup)
		return -ENOMEM;

	lookup->sq = *from;
	lookup->service = service;
	lookup->instance = instance;
	list_add_tail(&lookup->li, &qrtr_ns.lookups);

	memset(&filter, 0, sizeof(filter));
	filter.service = service;
	filter.instance = instance;

	xa_for_each(&nodes, node_idx, node) {
		xa_for_each(&node->servers, srv_idx, srv) {
			if (!server_match(srv, &filter))
				continue;

			lookup_notify(from, srv, true);
		}
	}

	/* Empty notification, to indicate end of listing */
	lookup_notify(from, NULL, true);

	return 0;
}

/*
 * Handle DEL_LOOKUP: drop every lookup registered by @from for @service
 * whose instance is either unset or equal to @instance.
 */
static void ctrl_cmd_del_lookup(struct sockaddr_qrtr *from,
				unsigned int service, unsigned int instance)
{
	struct qrtr_lookup *lookup;
	struct list_head *tmp;
	struct list_head *li;

	list_for_each_safe(li, tmp, &qrtr_ns.lookups) {
		lookup = container_of(li, struct qrtr_lookup, li);
		if (lookup->sq.sq_node != from->sq_node)
			continue;
		if (lookup->sq.sq_port != from->sq_port)
			continue;
		if (lookup->service != service)
			continue;
		if (lookup->instance && lookup->instance != instance)
			continue;

		list_del(&lookup->li);
		kfree(lookup);
	}
}

/*
 * Work handler: drain the name service socket without blocking and dispatch
 * each control packet to its handler. The loop ends when the socket reports
 * -EAGAIN or a receive error.
 */
static void qrtr_ns_worker(struct work_struct *work)
{
	const struct qrtr_ctrl_pkt *pkt;
	size_t recv_buf_size = 4096;
	struct sockaddr_qrtr sq;
	struct msghdr msg = { };
	unsigned int cmd;
	ssize_t msglen;
	void *recv_buf;
	struct kvec iv;
	int ret;

	msg.msg_name = (struct sockaddr *)&sq;
	msg.msg_namelen = sizeof(sq);

	recv_buf = kzalloc(recv_buf_size, GFP_KERNEL);
	if (!recv_buf)
		return;

	for (;;) {
		iv.iov_base = recv_buf;
		iv.iov_len = recv_buf_size;

		msglen = kernel_recvmsg(qrtr_ns.sock, &msg, &iv, 1,
					iv.iov_len, MSG_DONTWAIT);

		if (msglen == -EAGAIN)
			break;

		if (msglen < 0) {
			pr_err("error receiving packet: %zd\n", msglen);
			break;
		}

		/*
		 * NOTE(review): msglen is not compared against the size of the
		 * fields read below, and recv_buf is reused across iterations
		 * without being cleared - confirm that short packets cannot
		 * make a handler act on data left over from an earlier packet.
		 */
		pkt = recv_buf;
		cmd = le32_to_cpu(pkt->cmd);
		if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) &&
		    qrtr_ctrl_pkt_strings[cmd])
			trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd],
					      sq.sq_node, sq.sq_port);

		ret = 0;
		switch (cmd) {
		case QRTR_TYPE_HELLO:
			ret = ctrl_cmd_hello(&sq);
			break;
		case QRTR_TYPE_BYE:
			ret = ctrl_cmd_bye(&sq);
			break;
		case QRTR_TYPE_DEL_CLIENT:
			ret = ctrl_cmd_del_client(&sq,
					le32_to_cpu(pkt->client.node),
					le32_to_cpu(pkt->client.port));
			break;
		case QRTR_TYPE_NEW_SERVER:
			ret = ctrl_cmd_new_server(&sq,
					le32_to_cpu(pkt->server.service),
					le32_to_cpu(pkt->server.instance),
					le32_to_cpu(pkt->server.node),
					le32_to_cpu(pkt->server.port));
			break;
		case QRTR_TYPE_DEL_SERVER:
			ret = ctrl_cmd_del_server(&sq,
					 le32_to_cpu(pkt->server.service),
					 le32_to_cpu(pkt->server.instance),
					 le32_to_cpu(pkt->server.node),
					 le32_to_cpu(pkt->server.port));
			break;
		/* Known commands that need no action from the name service */
		case QRTR_TYPE_EXIT:
		case QRTR_TYPE_PING:
		case QRTR_TYPE_RESUME_TX:
			break;
		case QRTR_TYPE_NEW_LOOKUP:
			ret = ctrl_cmd_new_lookup(&sq,
					 le32_to_cpu(pkt->server.service),
					 le32_to_cpu(pkt->server.instance));
			break;
		case QRTR_TYPE_DEL_LOOKUP:
			ctrl_cmd_del_lookup(&sq,
				    le32_to_cpu(pkt->server.service),
				    le32_to_cpu(pkt->server.instance));
			break;
		}

		if (ret < 0)
			pr_err("failed while handling packet from %d:%d",
			       sq.sq_node, sq.sq_port);
	}

	kfree(recv_buf);
}

/* Socket data-ready callback: defer packet processing to the workqueue. */
static void qrtr_ns_data_ready(struct sock *sk)
{
	trace_sk_data_ready(sk);

	queue_work(qrtr_ns.workqueue, &qrtr_ns.work);
}

/*
 * Set up the name service: create the kernel socket, bind it to the control
 * port of the local node, create the ordered workqueue that handles
 * incoming packets and broadcast an initial HELLO.
 *
 * Returns 0 on success or a negative errno, in which case the socket (and
 * the workqueue, if already created) have been released again.
 */
int qrtr_ns_init(void)
{
	struct sockaddr_qrtr sq;
	int ret;

	INIT_LIST_HEAD(&qrtr_ns.lookups);
	INIT_WORK(&qrtr_ns.work, qrtr_ns_worker);

	ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM,
			       PF_QIPCRTR, &qrtr_ns.sock);
	if (ret < 0)
		return ret;

	ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq);
	if (ret < 0) {
		pr_err("failed to get socket name\n");
		goto err_sock;
	}

	qrtr_ns.workqueue = alloc_ordered_workqueue("qrtr_ns_handler", 0);
	if (!qrtr_ns.workqueue) {
		ret = -ENOMEM;
		goto err_sock;
	}

	qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready;

	/* Keep the node reported by getsockname, move to the control port */
	sq.sq_port = QRTR_PORT_CTRL;
	qrtr_ns.local_node = sq.sq_node;

	ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq));
	if (ret < 0) {
		pr_err("failed to bind to socket\n");
		goto err_wq;
	}

	qrtr_ns.bcast_sq.sq_family = AF_QIPCRTR;
	qrtr_ns.bcast_sq.sq_node = QRTR_NODE_BCAST;
	qrtr_ns.bcast_sq.sq_port = QRTR_PORT_CTRL;

	ret = say_hello(&qrtr_ns.bcast_sq);
	if (ret < 0)
		goto err_wq;

	/* As the qrtr ns socket owner and creator is the same module, we have
	 * to decrease the qrtr module reference count to guarantee that it
	 * remains zero after the ns socket is created, otherwise, executing
	 * "rmmod" command is unable to make the qrtr module deleted after the
	 *  qrtr module is inserted successfully.
	 *
	 * However, the reference count is increased twice in
	 * sock_create_kern(): one is to increase the reference count of owner
	 * of qrtr socket's proto_ops struct; another is to increment the
	 * reference count of owner of qrtr proto struct. Therefore, we must
	 * decrement the module reference count twice to ensure that it keeps
	 * zero after server's listening socket is created. Of course, we
	 * must bump the module reference count twice as well before the socket
	 * is closed.
	 */
	module_put(qrtr_ns.sock->ops->owner);
	module_put(qrtr_ns.sock->sk->sk_prot_creator->owner);

	return 0;

err_wq:
	destroy_workqueue(qrtr_ns.workqueue);
err_sock:
	sock_release(qrtr_ns.sock);
	return ret;
}
EXPORT_SYMBOL_GPL(qrtr_ns_init);

/*
 * Tear down the name service: stop pending work, destroy the workqueue and
 * release the socket after re-taking the module references dropped in
 * qrtr_ns_init().
 */
void qrtr_ns_remove(void)
{
	cancel_work_sync(&qrtr_ns.work);
	destroy_workqueue(qrtr_ns.workqueue);

	/* sock_release() expects the two references that were put during
	 * qrtr_ns_init(). This function is only called during module remove,
	 * so try_stop_module() has already set the refcnt to 0. Use
	 * __module_get() instead of try_module_get() to successfully take two
	 * references.
	 */
	__module_get(qrtr_ns.sock->ops->owner);
	__module_get(qrtr_ns.sock->sk->sk_prot_creator->owner);
	sock_release(qrtr_ns.sock);
}
EXPORT_SYMBOL_GPL(qrtr_ns_remove);

MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>");
MODULE_DESCRIPTION("Qualcomm IPC Router Nameservice");
MODULE_LICENSE("Dual BSD/GPL");
1 1 1 1 1 1425 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
// SPDX-License-Identifier: GPL-2.0
/*
 * Dynamic byte queue limits.  See include/linux/dynamic_queue_limits.h
 *
 * Copyright (c) 2011, Tom Herbert <therbert@google.com>
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <trace/events/napi.h>

/*
 * Both helpers compare two counters through the sign of their difference
 * cast to int, rather than comparing the raw values.
 *
 * POSDIFF(A, B) - A - B when that signed difference is positive, else 0.
 * AFTER_EQ(A, B) - true when the signed difference A - B is >= 0.
 *
 * Note: each argument is expanded more than once, so callers must not pass
 * expressions with side effects.
 */
#define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
#define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)

/*
 * dql_check_stall - look for a Tx completion stall and account for it
 * @dql: queue state; reads last_reap, history and history_head, and may
 *       update stall_cnt and stall_max
 * @stall_thrs: stall threshold in jiffies; 0 disables the check
 *
 * When @stall_thrs is 0 the function returns immediately and does not touch
 * dql->last_reap.  Otherwise, if at least @stall_thrs jiffies have passed
 * since dql->last_reap, the history bitmap is searched for a set bit in the
 * window computed below.  If one is found (and history_head did not change
 * while searching), stall_cnt is incremented, stall_max is raised to
 * now - t if that is larger, and the dql_stall_detected trace event fires.
 * In every case where @stall_thrs is non-zero, last_reap is set to now.
 */
static void dql_check_stall(struct dql *dql, unsigned short stall_thrs)
{
	unsigned long now;

	if (!stall_thrs)
		return;

	now = jiffies;
	/* Check for a potential stall */
	if (time_after_eq(now, dql->last_reap + stall_thrs)) {
		unsigned long hist_head, t, start, end;

		/* We are trying to detect a period of at least @stall_thrs
		 * jiffies without any Tx completions, but during the first
		 * half of which some Tx was posted.
		 */
dqs_again:
		/* Snapshot the head; it is re-read below to detect a
		 * concurrent update of the ring buffer.
		 */
		hist_head = READ_ONCE(dql->history_head);
		/* pairs with smp_wmb() in dql_queued() */
		smp_rmb();

		/* Get the previous entry in the ring buffer, which is the
		 * oldest sample.
		 */
		start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG;

		/* Advance start to continue from the last reap time */
		if (time_before(start, dql->last_reap + 1))
			start = dql->last_reap + 1;

		/* Newest sample we should have already seen a completion for */
		end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1);

		/* Shrink the search space to [start, (now - stall_thrs/2)] if
		 * `end` is beyond the stall zone
		 */
		if (time_before(now, end + stall_thrs / 2))
			end = now - stall_thrs / 2;

		/* Search for the queued time in [start, end]; the bit index
		 * is the jiffy value modulo the ring size in bits.
		 */
		for (t = start; time_before_eq(t, end); t++)
			if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG),
				     dql->history))
				break;

		/* Variable t contains the time of the queue; if the loop ran
		 * past `end` no bit was set in the window.
		 */
		if (!time_before_eq(t, end))
			goto no_stall;

		/* The ring buffer was modified in the meantime, retry */
		if (hist_head != READ_ONCE(dql->history_head))
			goto dqs_again;

		dql->stall_cnt++;
		/* max_t() casts both operands to unsigned short first */
		dql->stall_max = max_t(unsigned short, dql->stall_max, now - t);

		trace_dql_stall_detected(dql->stall_thrs, now - t,
					 dql->last_reap, dql->history_head,
					 now, dql->history);
	}
no_stall:
	dql->last_reap = now;
}

/*
 * dql_completed - record completed count and recalculate the queue limit
 * @dql: queue state to update
 * @count: amount completed since the previous call, in the same units as
 *         dql->num_queued
 *
 * Triggers BUG_ON() when @count exceeds num_queued - num_completed.
 * Updates limit (clamped to [min_limit, max_limit]), adj_limit,
 * prev_ovlimit, prev_last_obj_cnt, num_completed and prev_num_queued, and
 * finishes by calling dql_check_stall() with the stall threshold sampled at
 * entry.
 */
void dql_completed(struct dql *dql, unsigned int count)
{
	unsigned int inprogress, prev_inprogress, limit;
	unsigned int ovlimit, completed, num_queued;
	unsigned short stall_thrs;
	bool all_prev_completed;

	num_queued = READ_ONCE(dql->num_queued);
	/* Read stall_thrs in advance since it belongs to the same (first)
	 * cache line as ->num_queued. This way, dql_check_stall() does not
	 * need to touch the first cache line again later, reducing the window
	 * of possible false sharing.
	 */
	stall_thrs = READ_ONCE(dql->stall_thrs);

	/* Can't complete more than what's in queue */
	BUG_ON(count > num_queued - dql->num_completed);

	completed = dql->num_completed + count;
	limit = dql->limit;
	/* Amount outstanding before this completion that exceeded the limit */
	ovlimit = POSDIFF(num_queued - dql->num_completed, limit);
	inprogress = num_queued - completed;
	prev_inprogress = dql->prev_num_queued - dql->num_completed;
	all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued);

	if ((ovlimit && !inprogress) ||
	    (dql->prev_ovlimit && all_prev_completed)) {
		/*
		 * Queue considered starved if:
		 *   - The queue was over-limit in the last interval,
		 *     and there is no more data in the queue.
		 *  OR
		 *   - The queue was over-limit in the previous interval and
		 *     when enqueuing it was possible that all queued data
		 *     had been consumed.  This covers the case when queue
		 *     may have become starved between completion processing
		 *     running and next time enqueue was scheduled.
		 *
		 *     When queue is starved increase the limit by the amount
		 *     of bytes both sent and completed in the last interval,
		 *     plus any previous over-limit.
		 */
		limit += POSDIFF(completed, dql->prev_num_queued) +
		     dql->prev_ovlimit;
		/* Restart the slack observation window */
		dql->slack_start_time = jiffies;
		dql->lowest_slack = UINT_MAX;
	} else if (inprogress && prev_inprogress && !all_prev_completed) {
		/*
		 * Queue was not starved, check if the limit can be decreased.
		 * A decrease is only considered if the queue has been busy in
		 * the whole interval (the check above).
		 *
		 * If there is slack, the amount of excess data queued above
		 * the amount needed to prevent starvation, the queue limit
		 * can be decreased.  To avoid hysteresis we consider the
		 * minimum amount of slack found over several iterations of the
		 * completion routine.
		 */
		unsigned int slack, slack_last_objs;

		/*
		 * Slack is the maximum of
		 *   - The queue limit plus previous over-limit minus twice
		 *     the number of objects completed.  Note that two times
		 *     number of completed bytes is a basis for an upper bound
		 *     of the limit.
		 *   - Portion of objects in the last queuing operation that
		 *     was not part of non-zero previous over-limit.  That is
		 *     "round down" by non-overlimit portion of the last
		 *     queueing operation.
		 */
		slack = POSDIFF(limit + dql->prev_ovlimit,
		    2 * (completed - dql->num_completed));
		slack_last_objs = dql->prev_ovlimit ?
		    POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0;

		slack = max(slack, slack_last_objs);

		if (slack < dql->lowest_slack)
			dql->lowest_slack = slack;

		/* Only shrink once slack_hold_time jiffies have elapsed since
		 * the observation window started.
		 */
		if (time_after(jiffies,
			       dql->slack_start_time + dql->slack_hold_time)) {
			limit = POSDIFF(limit, dql->lowest_slack);
			dql->slack_start_time = jiffies;
			dql->lowest_slack = UINT_MAX;
		}
	}

	/* Enforce bounds on limit */
	limit = clamp(limit, dql->min_limit, dql->max_limit);

	if (limit != dql->limit) {
		dql->limit = limit;
		/* The limit moved, so the over-limit amount is not carried
		 * into the next interval.
		 */
		ovlimit = 0;
	}

	dql->adj_limit = limit + completed;
	dql->prev_ovlimit = ovlimit;
	dql->prev_last_obj_cnt = READ_ONCE(dql->last_obj_cnt);
	dql->num_completed = completed;
	dql->prev_num_queued = num_queued;

	dql_check_stall(dql, stall_thrs);
}
EXPORT_SYMBOL(dql_completed);

/*
 * dql_reset - return all dynamic state of @dql to its initial values
 * @dql: queue state to reset
 *
 * Zeroes the limit, the queued/completed counters and the history bitmap,
 * and re-bases slack_start_time, last_reap and history_head on the current
 * jiffies.  The fields assigned in dql_init() (max_limit, min_limit,
 * slack_hold_time, stall_thrs) are not modified here.
 */
void dql_reset(struct dql *dql)
{
	/* Reset all dynamic values */
	dql->limit = 0;
	dql->num_queued = 0;
	dql->num_completed = 0;
	dql->last_obj_cnt = 0;
	dql->prev_num_queued = 0;
	dql->prev_last_obj_cnt = 0;
	dql->prev_ovlimit = 0;
	dql->lowest_slack = UINT_MAX;
	dql->slack_start_time = jiffies;

	dql->last_reap = jiffies;
	dql->history_head = jiffies / BITS_PER_LONG;
	memset(dql->history, 0, sizeof(dql->history));
}
EXPORT_SYMBOL(dql_reset);

/*
 * dql_init - set default configuration and reset @dql
 * @dql: queue state to initialise
 * @hold_time: stored in slack_hold_time; dql_completed() adds it to a
 *             jiffies timestamp, so it is expressed in jiffies
 *
 * Sets max_limit to DQL_MAX_LIMIT, min_limit to 0 and stall_thrs to 0
 * (which makes dql_check_stall() a no-op), then calls dql_reset().
 */
void dql_init(struct dql *dql, unsigned int hold_time)
{
	dql->max_limit = DQL_MAX_LIMIT;
	dql->min_limit = 0;
	dql->slack_hold_time = hold_time;
	dql->stall_thrs = 0;
	dql_reset(dql);
}
EXPORT_SYMBOL(dql_init);
9 9 9 1 8 8 8 8 9 1 8 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 // SPDX-License-Identifier: GPL-2.0-or-later /* * Handle firewalling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> #include <linux/uaccess.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif int br_validate_ipv6(struct net *net, struct sk_buff *skb) { const struct ipv6hdr *hdr; struct inet6_dev *idev = __in6_dev_get(skb->dev); u32 pkt_len; u8 ip6h_len = sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, ip6h_len)) goto inhdr_error; if (skb->len < ip6h_len) goto drop; hdr = ipv6_hdr(skb); if (hdr->version != 6) goto inhdr_error; pkt_len = 
ntohs(hdr->payload_len); if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len)) goto drop; if (pkt_len + ip6h_len > skb->len) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); /* No IP options in IPv6 header; however it should be * checked if some next headers need special treatment */ return 0; inhdr_error: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: return -1; } static inline bool br_nf_ipv6_daddr_was_changed(const struct sk_buff *skb, const struct nf_bridge_info *nf_bridge) { return memcmp(&nf_bridge->ipv6_daddr, &ipv6_hdr(skb)->daddr, sizeof(ipv6_hdr(skb)->daddr)) != 0; } /* PF_BRIDGE/PRE_ROUTING: Undo the changes made for ip6tables * PREROUTING and continue the bridge PRE_ROUTING hook. See comment * for br_nf_pre_routing_finish(), same logic is used here but * equivalent IPv6 function ip6_route_input() called indirectly. 
*/ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct rtable *rt; struct net_device *dev = skb->dev, *br_indev; const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { kfree_skb(skb); return 0; } nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size; if (nf_bridge->pkt_otherhost) { skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } nf_bridge->in_prerouting = 0; if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { skb_dst_drop(skb); v6ops->route_input(skb); if (skb_dst(skb)->error) { kfree_skb(skb); return 0; } if (skb_dst(skb)->dev == dev) { skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); skb->pkt_type = PACKET_HOST; } else { rt = bridge_parent_rtable(br_indev); if (!rt) { kfree_skb(skb); return 0; } skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, br_handle_frame_finish); return 0; } /* Replicate the checks that IPv6 does on packet reception and pass the packet * to ip6tables. 
*/ unsigned int br_nf_pre_routing_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_bridge_info *nf_bridge; if (br_validate_ipv6(state->net, skb)) return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0); nf_bridge = nf_bridge_alloc(skb); if (!nf_bridge) return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0); if (!setup_pre_routing(skb, state->net)) return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0); nf_bridge = nf_bridge_info_get(skb); nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr; skb->protocol = htons(ETH_P_IPV6); skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb, skb->dev, NULL, br_nf_pre_routing_finish_ipv6); return NF_STOLEN; }
1184 60 1185 60 71 12 69 163 163 155 155 3 174 25 174 26 28 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 
1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H #include <linux/mm.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> #include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> #include 
<linux/userfaultfd_k.h> struct ctl_table; struct user_struct; struct mmu_gather; struct node; void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE #include <linux/pagemap.h> #include <linux/shm.h> #include <asm/tlbflush.h> /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail * struct page to store the metadata. */ #define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; long count; long max_hpages; /* Maximum huge pages or -1 if no maximum. */ long used_hpages; /* Used count against maximum, includes */ /* both allocated and reserved pages. */ struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ /* satisfy minimum size. */ }; struct resv_map { struct kref refs; spinlock_t lock; struct list_head regions; long adds_in_progress; struct list_head region_cache; long region_cache_count; struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored * here. If these fields are 0, then either the mapping is shared, or * cgroup accounting is disabled for this resv_map. */ struct page_counter *reservation_counter; unsigned long pages_per_hpage; struct cgroup_subsys_state *css; #endif }; /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * * The region data structures are embedded into a resv_map and protected * by a resv_map's lock. The set of regions within the resv_map represent * reservations for huge pages, or huge pages that have already been * instantiated within the map. The from and to elements are huge page * indices into the associated mapping. from indicates the starting index * of the region. to represents the first index past the end of the region. 
* * For example, a file region structure with from == 0 and to == 4 represents * four huge pages in a mapping. It is important to note that the to element * represents the first element past the end of the region. This is used in * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. * * Interval notation of the form [from, to) will be used to indicate that * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; long from; long to; #ifdef CONFIG_CGROUP_HUGETLB /* * On shared mappings, each reserved region appears as a struct * file_region in resv_map. These fields hold the info needed to * uncharge each reservation. */ struct page_counter *reservation_counter; struct cgroup_subsys_state *css; #endif }; struct hugetlb_vma_lock { struct kref refs; struct rw_semaphore rw_sema; struct vm_area_struct *vma; }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file 
*); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address); struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; /* arch callbacks */ #ifndef CONFIG_HIGHPTE /* * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures * which may go down to the lowest PTE level in their huge_pte_offset() and * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap(). 
*/ static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address); } #endif pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); /* * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * * IMPORTANT: we should normally not directly call this function, instead * this is only a common interface to implement arch-specific * walker. Please use hugetlb_walk() instead, because that will attempt to * verify the locking for you. * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be * responsible of its thread safety. One can follow this rule: * * (1) For private mappings: pmd unsharing is not possible, so holding the * mmap_lock for either read or write is sufficient. Most callers * already hold the mmap_lock, so normally, no special action is * required. * * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged * pgtable page can go away from under us! It can be done by a pmd * unshare with a follow up munmap() on the other process), then we * need either: * * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare * won't happen upon the range (it also makes sure the pte_t we * read is the right and stable one), or, * * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make * sure even if unshare happened the racy unmap() will wait until * i_mmap_rwsem is released. * * Option (2.1) is the safest, which guarantees pte stability from pmd * sharing pov, until the vma lock released. 
Option (2.2) doesn't protect * a concurrent pmd unshare, but it makes sure the pgtable page is safe to * access. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); extern void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *begin, unsigned long *end); extern void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details); static inline void hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_begin(vma, start, end); } static inline void hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_end(vma, details); } void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); void hugetlb_vma_lock_write(struct vm_area_struct *vma); void hugetlb_vma_unlock_write(struct vm_area_struct *vma); int hugetlb_vma_trylock_write(struct vm_area_struct *vma); void hugetlb_vma_assert_locked(struct vm_area_struct *vma); void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { } static inline unsigned long hugetlb_total_pages(void) { return 0; } static inline struct address_space 
*hugetlb_folio_mapping_lock_write( struct folio *folio) { return NULL; } static inline int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_begin( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_end( struct vm_area_struct *vma, struct zap_details *details) { } static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { BUG(); return 0; } static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { BUG(); return 0; } static inline void hugetlb_report_meminfo(struct seq_file *m) { } static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) { return 0; } static inline void hugetlb_show_meminfo_node(int nid) { } static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { return -EINVAL; } static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { } static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { return 1; } static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { } static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { BUG(); } #ifdef 
CONFIG_USERFAULTFD static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { BUG(); return 0; } #endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { return NULL; } static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void folio_putback_hugetlb(struct folio *folio) { } static inline void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { } static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { return 0; } static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { BUG(); } static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { BUG(); return 0; } static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write static inline int pgd_write(pgd_t pgd) { BUG(); return 0; } #endif #define HUGETLB_ANON_FILE "anon_hugepage" enum { /* * The file will be used as an shm file so shmfs accounting rules * apply */ HUGETLB_SHMFS_INODE = 1, /* * The file is being created on the internal vfs mount and shmfs * accounting rules do not apply */ HUGETLB_ANONHUGE_INODE = 2, }; #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t 
stat_lock; struct hstate *hstate; struct hugepage_subpool *spool; kuid_t uid; kgid_t gid; umode_t mode; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; } struct hugetlbfs_inode_info { struct inode vfs_inode; unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) { return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) { return file->f_op->fop_flags & FOP_HUGE_PAGES; } static inline struct hstate *hstate_inode(struct inode *i) { return HUGETLBFS_SB(i->i_sb)->hstate; } #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); } static inline struct hstate *hstate_inode(struct inode *i) { return NULL; } #endif /* !CONFIG_HUGETLBFS */ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* * hugetlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be * used to manipulate these flags. * * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. * Synchronization: Examined or modified by code that knows it has * the only reference to page. i.e. After allocation but before use * or when the page is being freed. * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. 
Indicates the page is a candidate for * migration. * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. * Synchronization: Can be set after huge page allocation from buddy when * code knows it has only reference. All other examinations and * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page * that is not tracked by raw_hwp_page list. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, __NR_HPAGEFLAGS, }; /* * Macros to create test, set and clear function definitions for * hugetlb specific page flags. 
*/ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ { } #endif #define HPAGEFLAG(uname, flname) \ TESTHPAGEFLAG(uname, flname) \ SETHPAGEFLAG(uname, flname) \ CLEARHPAGEFLAG(uname, flname) \ /* * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; unsigned long free_huge_pages; unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int 
max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; char name[HSTATE_NAME_LEN]; }; struct huge_bootmem_page { struct list_head list; struct hstate *hstate; }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); int __init alloc_bootmem_huge_page(struct hstate *h, int nid); bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE #define HUGE_MAX_HSTATE 1 #endif extern struct hstate hstates[HUGE_MAX_HSTATE]; extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return folio->_hugetlb_subpool; } static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { folio->_hugetlb_subpool = subpool; } static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); } static inline struct hstate *hstate_sizelog(int page_size_log) { if 
(!page_size_log) return &default_hstate; if (page_size_log < BITS_PER_LONG) return size_to_hstate(1UL << page_size_log); return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return hstate_file(vma->vm_file); } static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; } static inline unsigned int huge_page_order(struct hstate *h) { return h->order; } static inline unsigned huge_page_shift(struct hstate *h) { return h->order + PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } static inline unsigned int blocks_per_huge_page(struct hstate *h) { return huge_page_size(h) / 512; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return filemap_lock_folio(mapping, idx << huge_page_order(h)); } #include <asm/hugetlb.h> #ifndef is_hugepage_only_range static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } #define is_hugepage_only_range is_hugepage_only_range #endif #ifndef arch_clear_hugetlb_flags static inline void arch_clear_hugetlb_flags(struct folio *folio) { } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags #endif #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { return pte_mkhuge(entry); } #endif static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); return size_to_hstate(folio_size(folio)); } static inline unsigned 
hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; } static inline int hstate_index(struct hstate *h) { return h - hstates; } int dissolve_free_hugetlb_folio(struct folio *folio); int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) { if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; } #endif #else static inline bool arch_hugetlb_migration_supported(struct hstate *h) { return false; } #endif static inline bool hugepage_migration_supported(struct hstate *h) { return arch_hugetlb_migration_supported(h); } /* * Movability check is different as compared to migration check. * It determines whether or not a huge page should be placed on * movable zone or not. Movability of any huge page should be * required only if huge page size is supported for migration. * There won't be any reason for the huge page to be movable if * it is not migratable to start with. Also the size of the huge * page should be large enough to be placed under a movable zone * and still feasible enough to be migratable. Just the presence * in movable zone does not make the migration feasible. * * So even though large huge page sizes like the gigantic ones * are migratable they should not be movable because its not * feasible to migrate them from movable zone. */ static inline bool hugepage_movable_supported(struct hstate *h) { if (!hugepage_migration_supported(h)) return false; if (hstate_is_gigantic(h)) return false; return true; } /* Movability of hugepages depends on migration support. 
*/ static inline gfp_t htlb_alloc_mask(struct hstate *h) { gfp_t gfp = __GFP_COMP | __GFP_NOWARN; gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { gfp_t modified_mask = htlb_alloc_mask(h); /* Some callers might want to enforce node */ modified_mask |= (gfp_mask & __GFP_THISNODE); modified_mask |= (gfp_mask & __GFP_NOWARN); return modified_mask; } static inline bool htlb_allow_alloc_fallback(int reason) { bool allowed_fallback = false; /* * Note: the memory offline, memory failure and migration syscalls will * be allowed to fallback to other nodes due to lack of a better choice, * that might break the per-node hugetlb pool. While other cases will * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool. */ switch (reason) { case MR_MEMORY_HOTPLUG: case MR_MEMORY_FAILURE: case MR_SYSCALL: case MR_MEMPOLICY_MBIND: allowed_fallback = true; break; default: break; } return allowed_fallback; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { const unsigned long size = huge_page_size(h); VM_WARN_ON(size == PAGE_SIZE); /* * hugetlb must use the exact same PT locks as core-mm page table * walkers would. When modifying a PTE table, hugetlb must take the * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD * PT lock etc. * * The expectation is that any hugetlb folio smaller than a PMD is * always mapped into a single PTE table and that any hugetlb folio * smaller than a PUD (but at least as big as a PMD) is always mapped * into a single PMD table. * * If that does not hold for an architecture, then that architecture * must disable split PT locks such that all *_lockptr() functions * will give us the same result: the per-MM PT lock. 
* * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() * and core-mm would use pmd_lockptr(). However, in such configurations * split PMD locks are disabled -- they don't make sense on a single * PGDIR page table -- and the end result is the same. */ if (size >= PUD_SIZE) return pud_lockptr(mm, (pud_t *) pte); else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ return ptep_lockptr(mm, pte); } #ifndef hugepages_supported /* * Some platform decide whether they support huge pages at boot * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0 * when there is no such support */ #define hugepages_supported() (HPAGE_SHIFT != 0) #endif void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); static inline void hugetlb_count_init(struct mm_struct *mm) { atomic_long_set(&mm->hugetlb_usage, 0); } static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); } #endif #ifndef huge_ptep_modify_prot_commit #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { unsigned long psize = huge_page_size(hstate_vma(vma)); set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif #ifdef CONFIG_NUMA void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif /* * Check if a given raw @page in a 
hugepage is HWPOISON. */ bool is_raw_hwpoison_page_in_hugepage(struct page *page); static inline unsigned long huge_page_mask_align(struct file *file) { return PAGE_MASK & ~huge_page_mask(hstate_file(file)); } #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; static inline unsigned long huge_page_mask_align(struct file *file) { return 0; } static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return NULL; } static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { return -ENOMEM; } static inline int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) { return NULL; } static inline struct folio * alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; } static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { return NULL; } static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; } static inline struct hstate *hstate_file(struct file *f) { return NULL; } static inline struct hstate *hstate_sizelog(int page_size_log) { return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return NULL; } static inline struct hstate *folio_hstate(struct folio *folio) { return NULL; } static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; } static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; } static inline unsigned long huge_page_mask(struct hstate *h) { return PAGE_MASK; } static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static 
inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned int huge_page_order(struct hstate *h) { return 0; } static inline unsigned int huge_page_shift(struct hstate *h) { return PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return false; } static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } static inline unsigned hstate_index_to_shift(unsigned index) { return 0; } static inline int hstate_index(struct hstate *h) { return 0; } static inline int dissolve_free_hugetlb_folio(struct folio *folio) { return 0; } static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline bool hugepage_migration_supported(struct hstate *h) { return false; } static inline bool hugepage_movable_supported(struct hstate *h) { return false; } static inline gfp_t htlb_alloc_mask(struct hstate *h) { return 0; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { return 0; } static inline bool htlb_allow_alloc_fallback(int reason) { return false; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void hugetlb_count_init(struct mm_struct *mm) { } static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_MMU return ptep_get(ptep); #else return *ptep; #endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { } static inline void hugetlb_register_node(struct node *node) { } static inline void hugetlb_unregister_node(struct node *node) { } static inline bool hugetlbfs_pagecache_present( struct hstate *h, struct vm_area_struct *vma, 
unsigned long address) { return false; } #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { spinlock_t *ptl; ptl = huge_pte_lockptr(h, mm, pte); spin_lock(ptl); return ptl; } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); #else static inline __init void hugetlb_cma_reserve(int order) { } #endif #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { return page_count(virt_to_page(pte)) > 1; } #else static inline bool hugetlb_pmd_shared(pte_t *pte) { return false; } #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE /* * ARCHes with special requirements for evicting HUGETLB backing TLB entries can * implement this. */ #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif static inline bool __vma_shareable_lock(struct vm_area_struct *vma) { return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). */ static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { #if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* * If pmd sharing possible, locking needed to safely walk the * hugetlb pgtables. More information can be found at the comment * above huge_pte_offset() in the same file. * * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. */ if (__vma_shareable_lock(vma)) WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && !lockdep_is_held( &vma->vm_file->f_mapping->i_mmap_rwsem)); #endif return huge_pte_offset(vma->vm_mm, addr, sz); } #endif /* _LINUX_HUGETLB_H */
4 4 56 2 3 52 52 51 46 48 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/atari.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "atari.h" /* ++guenther: this should be settable by the user ("make config")?. */ #define ICD_PARTS /* check if a partition entry looks valid -- Atari format is assumed if at least one of the primary entries is ok this way */ #define VALID_PARTITION(pi,hdsiz) \ (((pi)->flg & 1) && \ isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ be32_to_cpu((pi)->st) <= (hdsiz) && \ be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) static inline int OK_id(char *s) { return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || memcmp (s, "RAW", 3) == 0 ; } int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; struct partition_info *pi; u32 extensect; u32 hd_size; int slot; #ifdef ICD_PARTS int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif /* * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. 
*/ if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, &sect); if (!rs) return -1; /* Verify this is an Atari rootsector: */ hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && !VALID_PARTITION(&rs->part[3], hd_size)) { /* * if there's no valid primary partition, assume that no Atari * format partition table (there's no reliable magic or the like * :-() */ put_dev_sector(sect); return 0; } pi = &rs->part[0]; strlcat(state->pp_buf, " AHDI", PAGE_SIZE); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; ulong partsect; if ( !(pi->flg & 1) ) continue; /* active partition */ if (memcmp (pi->id, "XGM", 3) != 0) { /* we don't care about other id's */ put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); continue; } /* extension partition */ #ifdef ICD_PARTS part_fmt = 1; #endif strlcat(state->pp_buf, " XGM<", PAGE_SIZE); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, &sect2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); return -1; } /* ++roman: sanity check: bit 0 of flg field must be set */ if (!(xrs->part[0].flg & 1)) { printk( "\nFirst sub-partition in extended partition is not valid!\n" ); put_dev_sector(sect2); break; } put_partition(state, slot, partsect + be32_to_cpu(xrs->part[0].st), be32_to_cpu(xrs->part[0].siz)); if (!(xrs->part[1].flg & 1)) { /* end of linked partition list */ put_dev_sector(sect2); break; } if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { printk("\nID of extended partition is not XGM!\n"); put_dev_sector(sect2); break; } partsect = be32_to_cpu(xrs->part[1].st) + extensect; put_dev_sector(sect2); if (++slot == state->limit) { printk( "\nMaximum number of partitions reached!\n" ); break; } } strlcat(state->pp_buf, " >", 
PAGE_SIZE); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { strlcat(state->pp_buf, " ICD<", PAGE_SIZE); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } strlcat(state->pp_buf, " >", PAGE_SIZE); } } #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
9 1 3 5 9 1 64 6 1 1 1 1 2 63 63 62 60 10 1 1 3 1 1 3 5 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 /* * linux/drivers/video/fbcmap.c -- Colormap handling for frame buffer devices * * Created 15 Jun 1997 by Geert Uytterhoeven * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. 
*/ #include <linux/string.h> #include <linux/module.h> #include <linux/fb.h> #include <linux/slab.h> #include <linux/uaccess.h> static u16 red2[] __read_mostly = { 0x0000, 0xaaaa }; static u16 green2[] __read_mostly = { 0x0000, 0xaaaa }; static u16 blue2[] __read_mostly = { 0x0000, 0xaaaa }; static u16 red4[] __read_mostly = { 0x0000, 0xaaaa, 0x5555, 0xffff }; static u16 green4[] __read_mostly = { 0x0000, 0xaaaa, 0x5555, 0xffff }; static u16 blue4[] __read_mostly = { 0x0000, 0xaaaa, 0x5555, 0xffff }; static u16 red8[] __read_mostly = { 0x0000, 0x0000, 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; static u16 green8[] __read_mostly = { 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0x0000, 0x0000, 0x5555, 0xaaaa }; static u16 blue8[] __read_mostly = { 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa }; static u16 red16[] __read_mostly = { 0x0000, 0x0000, 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0x5555, 0x5555, 0x5555, 0x5555, 0xffff, 0xffff, 0xffff, 0xffff }; static u16 green16[] __read_mostly = { 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0x0000, 0x0000, 0x5555, 0xaaaa, 0x5555, 0x5555, 0xffff, 0xffff, 0x5555, 0x5555, 0xffff, 0xffff }; static u16 blue16[] __read_mostly = { 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x5555, 0xffff, 0x5555, 0xffff, 0x5555, 0xffff, 0x5555, 0xffff }; static const struct fb_cmap default_2_colors = { .len=2, .red=red2, .green=green2, .blue=blue2 }; static const struct fb_cmap default_8_colors = { .len=8, .red=red8, .green=green8, .blue=blue8 }; static const struct fb_cmap default_4_colors = { .len=4, .red=red4, .green=green4, .blue=blue4 }; static const struct fb_cmap default_16_colors = { .len=16, .red=red16, .green=green16, .blue=blue16 }; /** * fb_alloc_cmap_gfp - allocate a colormap * @cmap: frame buffer colormap structure * @len: length of @cmap * @transp: boolean, 1 if there is transparency, 0 otherwise * @flags: flags for kmalloc memory allocation * * Allocates memory for a colormap @cmap. 
@len is the * number of entries in the palette. * * Returns negative errno on error, or zero on success. * */ int fb_alloc_cmap_gfp(struct fb_cmap *cmap, int len, int transp, gfp_t flags) { int size = len * sizeof(u16); int ret = -ENOMEM; flags |= __GFP_NOWARN; if (cmap->len != len) { fb_dealloc_cmap(cmap); if (!len) return 0; cmap->red = kzalloc(size, flags); if (!cmap->red) goto fail; cmap->green = kzalloc(size, flags); if (!cmap->green) goto fail; cmap->blue = kzalloc(size, flags); if (!cmap->blue) goto fail; if (transp) { cmap->transp = kzalloc(size, flags); if (!cmap->transp) goto fail; } else { cmap->transp = NULL; } } cmap->start = 0; cmap->len = len; ret = fb_copy_cmap(fb_default_cmap(len), cmap); if (ret) goto fail; return 0; fail: fb_dealloc_cmap(cmap); return ret; } int fb_alloc_cmap(struct fb_cmap *cmap, int len, int transp) { return fb_alloc_cmap_gfp(cmap, len, transp, GFP_ATOMIC); } /** * fb_dealloc_cmap - deallocate a colormap * @cmap: frame buffer colormap structure * * Deallocates a colormap that was previously allocated with * fb_alloc_cmap(). * */ void fb_dealloc_cmap(struct fb_cmap *cmap) { kfree(cmap->red); kfree(cmap->green); kfree(cmap->blue); kfree(cmap->transp); cmap->red = cmap->green = cmap->blue = cmap->transp = NULL; cmap->len = 0; } /** * fb_copy_cmap - copy a colormap * @from: frame buffer colormap structure * @to: frame buffer colormap structure * * Copy contents of colormap from @from to @to. 
*/ int fb_copy_cmap(const struct fb_cmap *from, struct fb_cmap *to) { unsigned int tooff = 0, fromoff = 0; size_t size; if (to->start > from->start) fromoff = to->start - from->start; else tooff = from->start - to->start; if (fromoff >= from->len || tooff >= to->len) return -EINVAL; size = min_t(size_t, to->len - tooff, from->len - fromoff); if (size == 0) return -EINVAL; size *= sizeof(u16); memcpy(to->red+tooff, from->red+fromoff, size); memcpy(to->green+tooff, from->green+fromoff, size); memcpy(to->blue+tooff, from->blue+fromoff, size); if (from->transp && to->transp) memcpy(to->transp+tooff, from->transp+fromoff, size); return 0; } int fb_cmap_to_user(const struct fb_cmap *from, struct fb_cmap_user *to) { unsigned int tooff = 0, fromoff = 0; size_t size; if (to->start > from->start) fromoff = to->start - from->start; else tooff = from->start - to->start; if (fromoff >= from->len || tooff >= to->len) return -EINVAL; size = min_t(size_t, to->len - tooff, from->len - fromoff); if (size == 0) return -EINVAL; size *= sizeof(u16); if (copy_to_user(to->red+tooff, from->red+fromoff, size)) return -EFAULT; if (copy_to_user(to->green+tooff, from->green+fromoff, size)) return -EFAULT; if (copy_to_user(to->blue+tooff, from->blue+fromoff, size)) return -EFAULT; if (from->transp && to->transp) if (copy_to_user(to->transp+tooff, from->transp+fromoff, size)) return -EFAULT; return 0; } /** * fb_set_cmap - set the colormap * @cmap: frame buffer colormap structure * @info: frame buffer info structure * * Sets the colormap @cmap for a screen of device @info. * * Returns negative errno on error, or zero on success. 
* */ int fb_set_cmap(struct fb_cmap *cmap, struct fb_info *info) { int i, start, rc = 0; u16 *red, *green, *blue, *transp; u_int hred, hgreen, hblue, htransp = 0xffff; red = cmap->red; green = cmap->green; blue = cmap->blue; transp = cmap->transp; start = cmap->start; if (start < 0 || (!info->fbops->fb_setcolreg && !info->fbops->fb_setcmap)) return -EINVAL; if (info->fbops->fb_setcmap) { rc = info->fbops->fb_setcmap(cmap, info); } else { for (i = 0; i < cmap->len; i++) { hred = *red++; hgreen = *green++; hblue = *blue++; if (transp) htransp = *transp++; if (info->fbops->fb_setcolreg(start++, hred, hgreen, hblue, htransp, info)) break; } } if (rc == 0) fb_copy_cmap(cmap, &info->cmap); return rc; } int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info) { int rc, size = cmap->len * sizeof(u16); struct fb_cmap umap; if (size < 0 || size < cmap->len) return -E2BIG; memset(&umap, 0, sizeof(struct fb_cmap)); rc = fb_alloc_cmap_gfp(&umap, cmap->len, cmap->transp != NULL, GFP_KERNEL); if (rc) return rc; if (copy_from_user(umap.red, cmap->red, size) || copy_from_user(umap.green, cmap->green, size) || copy_from_user(umap.blue, cmap->blue, size) || (cmap->transp && copy_from_user(umap.transp, cmap->transp, size))) { rc = -EFAULT; goto out; } umap.start = cmap->start; lock_fb_info(info); rc = fb_set_cmap(&umap, info); unlock_fb_info(info); out: fb_dealloc_cmap(&umap); return rc; } /** * fb_default_cmap - get default colormap * @len: size of palette for a depth * * Gets the default colormap for a specific screen depth. @len * is the size of the palette for a particular screen depth. * * Returns pointer to a frame buffer colormap structure. * */ const struct fb_cmap *fb_default_cmap(int len) { if (len <= 2) return &default_2_colors; if (len <= 4) return &default_4_colors; if (len <= 8) return &default_8_colors; return &default_16_colors; } /** * fb_invert_cmaps - invert all defaults colormaps * * Invert all default colormaps. 
* */ void fb_invert_cmaps(void) { u_int i; for (i = 0; i < ARRAY_SIZE(red2); i++) { red2[i] = ~red2[i]; green2[i] = ~green2[i]; blue2[i] = ~blue2[i]; } for (i = 0; i < ARRAY_SIZE(red4); i++) { red4[i] = ~red4[i]; green4[i] = ~green4[i]; blue4[i] = ~blue4[i]; } for (i = 0; i < ARRAY_SIZE(red8); i++) { red8[i] = ~red8[i]; green8[i] = ~green8[i]; blue8[i] = ~blue8[i]; } for (i = 0; i < ARRAY_SIZE(red16); i++) { red16[i] = ~red16[i]; green16[i] = ~green16[i]; blue16[i] = ~blue16[i]; } } /* * Visible symbols for modules */ EXPORT_SYMBOL(fb_alloc_cmap); EXPORT_SYMBOL(fb_dealloc_cmap); EXPORT_SYMBOL(fb_copy_cmap); EXPORT_SYMBOL(fb_set_cmap); EXPORT_SYMBOL(fb_default_cmap); EXPORT_SYMBOL(fb_invert_cmaps);
16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 
1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 
1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 
2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 
2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 
3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 
3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 // SPDX-License-Identifier: ISC /* * Copyright (c) 2005-2011 Atheros Communications Inc. * Copyright (c) 2011-2017 Qualcomm Atheros, Inc. * Copyright (c) 2018-2019, The Linux Foundation. All rights reserved. * Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ #include <linux/module.h> #include <linux/firmware.h> #include <linux/of.h> #include <linux/property.h> #include <linux/dmi.h> #include <linux/ctype.h> #include <linux/pm_qos.h> #include <linux/nvmem-consumer.h> #include <asm/byteorder.h> #include "core.h" #include "mac.h" #include "htc.h" #include "hif.h" #include "wmi.h" #include "bmi.h" #include "debug.h" #include "htt.h" #include "testmode.h" #include "wmi-ops.h" #include "coredump.h" #include "leds.h" unsigned int ath10k_debug_mask; EXPORT_SYMBOL(ath10k_debug_mask); static unsigned int ath10k_cryptmode_param; static bool uart_print; static bool skip_otp; static bool fw_diag_log; /* frame mode values are mapped as per enum ath10k_hw_txrx_mode */ unsigned int ath10k_frame_mode = ATH10K_HW_TXRX_NATIVE_WIFI; unsigned long ath10k_coredump_mask = BIT(ATH10K_FW_CRASH_DUMP_REGISTERS) | BIT(ATH10K_FW_CRASH_DUMP_CE_DATA); /* FIXME: most of these should be readonly */ module_param_named(debug_mask, ath10k_debug_mask, uint, 0644); module_param_named(cryptmode, ath10k_cryptmode_param, uint, 0644); module_param(uart_print, bool, 0644); module_param(skip_otp, bool, 0644); module_param(fw_diag_log, bool, 0644); module_param_named(frame_mode, ath10k_frame_mode, uint, 0644); module_param_named(coredump_mask, ath10k_coredump_mask, ulong, 0444); MODULE_PARM_DESC(debug_mask, "Debugging mask"); MODULE_PARM_DESC(uart_print, "Uart target debugging"); MODULE_PARM_DESC(skip_otp, "Skip otp failure for calibration in testmode"); MODULE_PARM_DESC(cryptmode, "Crypto mode: 0-hardware, 1-software"); MODULE_PARM_DESC(frame_mode, "Datapath frame mode (0: raw, 1: native wifi (default), 2: ethernet)"); MODULE_PARM_DESC(coredump_mask, "Bitfield of what to include in firmware crash file"); MODULE_PARM_DESC(fw_diag_log, "Diag based fw log debugging"); static const struct ath10k_hw_params ath10k_hw_params_list[] = { { .id = QCA988X_HW_2_0_VERSION, .dev_id = QCA988X_2_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca988x hw2.0", .patch_load_addr = 
QCA988X_HW_2_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 1, .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_ALL, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 2116, .fw = { .dir = QCA988X_HW_2_0_FW_DIR, .board_size = QCA988X_BOARD_DATA_SZ, .board_ext_size = QCA988X_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = true, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA988X_HW_2_0_VERSION, .dev_id = QCA988X_2_0_DEVICE_ID_UBNT, .name = "qca988x hw2.0 ubiquiti", .patch_load_addr = QCA988X_HW_2_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 0, .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_ALL, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 2116, .fw = { .dir = QCA988X_HW_2_0_FW_DIR, .board_size = QCA988X_BOARD_DATA_SZ, .board_ext_size = QCA988X_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = true, 
.dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA9887_HW_1_0_VERSION, .dev_id = QCA9887_1_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca9887 hw1.0", .patch_load_addr = QCA9887_HW_1_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 1, .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_ALL, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 2116, .fw = { .dir = QCA9887_HW_1_0_FW_DIR, .board_size = QCA9887_BOARD_DATA_SZ, .board_ext_size = QCA9887_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA6174_HW_3_2_VERSION, .dev_id = QCA6174_3_2_DEVICE_ID, .bus = ATH10K_BUS_SDIO, .name = "qca6174 hw3.2 sdio", .patch_load_addr = QCA6174_HW_3_0_PATCH_LOAD_ADDR, .uart_pin = 19, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 0, .fw = { .dir = QCA6174_HW_3_0_FW_DIR, .board_size = QCA6174_BOARD_DATA_SZ, .board_ext_size = QCA6174_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca6174_sdio_ops, .hw_clk = qca6174_clk, .target_cpu_freq = 176000000, .decap_align_bytes = 4, .n_cipher_suites = 8, .num_peers = 10, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .uart_pin_workaround = true, 
.tx_stats_over_pktlog = false, .credit_size_workaround = false, .bmi_large_size_download = true, .supports_peer_stats_info = true, .dynamic_sar_support = true, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA6174_HW_2_1_VERSION, .dev_id = QCA6164_2_1_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca6164 hw2.1", .patch_load_addr = QCA6174_HW_2_1_PATCH_LOAD_ADDR, .uart_pin = 6, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { .dir = QCA6174_HW_2_1_FW_DIR, .board_size = QCA6174_BOARD_DATA_SZ, .board_ext_size = QCA6174_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA6174_HW_2_1_VERSION, .dev_id = QCA6174_2_1_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca6174 hw2.1", .patch_load_addr = QCA6174_HW_2_1_PATCH_LOAD_ADDR, .uart_pin = 6, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { .dir = QCA6174_HW_2_1_FW_DIR, .board_size = QCA6174_BOARD_DATA_SZ, .board_ext_size = QCA6174_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, 
.n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA6174_HW_3_0_VERSION, .dev_id = QCA6174_2_1_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca6174 hw3.0", .patch_load_addr = QCA6174_HW_3_0_PATCH_LOAD_ADDR, .uart_pin = 6, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { .dir = QCA6174_HW_3_0_FW_DIR, .board_size = QCA6174_BOARD_DATA_SZ, .board_ext_size = QCA6174_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA6174_HW_3_2_VERSION, .dev_id = QCA6174_2_1_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca6174 hw3.2", .patch_load_addr = QCA6174_HW_3_0_PATCH_LOAD_ADDR, .uart_pin = 6, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { /* uses same binaries as hw3.0 */ .dir = QCA6174_HW_3_0_FW_DIR, .board_size = QCA6174_BOARD_DATA_SZ, .board_ext_size = 
QCA6174_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca6174_ops, .hw_clk = qca6174_clk, .target_cpu_freq = 176000000, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = true, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .supports_peer_stats_info = true, .dynamic_sar_support = true, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = true, }, { .id = QCA99X0_HW_2_0_DEV_VERSION, .dev_id = QCA99X0_2_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca99x0 hw2.0", .patch_load_addr = QCA99X0_HW_2_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 17, .otp_exe_param = 0x00000700, .continuous_frag_desc = true, .cck_rate_map_rev2 = true, .channel_counters_freq_hz = 150000, .max_probe_resp_desc_thres = 24, .tx_chain_mask = 0xf, .rx_chain_mask = 0xf, .max_spatial_stream = 4, .cal_data_len = 12064, .fw = { .dir = QCA99X0_HW_2_0_FW_DIR, .board_size = QCA99X0_BOARD_DATA_SZ, .board_ext_size = QCA99X0_BOARD_EXT_DATA_SZ, }, .sw_decrypt_mcast_mgmt = true, .rx_desc_ops = &qca99x0_rx_desc_ops, .hw_ops = &qca99x0_ops, .decap_align_bytes = 1, .spectral_bin_discard = 4, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 11, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, 
.delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA9984_HW_1_0_DEV_VERSION, .dev_id = QCA9984_1_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca9984/qca9994 hw1.0", .patch_load_addr = QCA9984_HW_1_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 17, .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_EACH, .otp_exe_param = 0x00000700, .continuous_frag_desc = true, .cck_rate_map_rev2 = true, .channel_counters_freq_hz = 150000, .max_probe_resp_desc_thres = 24, .tx_chain_mask = 0xf, .rx_chain_mask = 0xf, .max_spatial_stream = 4, .cal_data_len = 12064, .fw = { .dir = QCA9984_HW_1_0_FW_DIR, .board_size = QCA99X0_BOARD_DATA_SZ, .board_ext_size = QCA99X0_BOARD_EXT_DATA_SZ, .ext_board_size = QCA99X0_EXT_BOARD_DATA_SZ, }, .sw_decrypt_mcast_mgmt = true, .rx_desc_ops = &qca99x0_rx_desc_ops, .hw_ops = &qca99x0_ops, .decap_align_bytes = 1, .spectral_bin_discard = 12, .spectral_bin_offset = 8, /* Can do only 2x2 VHT160 or 80+80. 1560Mbps is 4x4 80Mhz * or 2x2 160Mhz, long-guard-interval. 
*/ .vht160_mcs_rx_highest = 1560, .vht160_mcs_tx_highest = 1560, .n_cipher_suites = 11, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA9888_HW_2_0_DEV_VERSION, .dev_id = QCA9888_2_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca9888 hw2.0", .patch_load_addr = QCA9888_HW_2_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 17, .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_EACH, .otp_exe_param = 0x00000700, .continuous_frag_desc = true, .channel_counters_freq_hz = 150000, .max_probe_resp_desc_thres = 24, .tx_chain_mask = 3, .rx_chain_mask = 3, .max_spatial_stream = 2, .cal_data_len = 12064, .fw = { .dir = QCA9888_HW_2_0_FW_DIR, .board_size = QCA99X0_BOARD_DATA_SZ, .board_ext_size = QCA99X0_BOARD_EXT_DATA_SZ, }, .sw_decrypt_mcast_mgmt = true, .rx_desc_ops = &qca99x0_rx_desc_ops, .hw_ops = &qca99x0_ops, .decap_align_bytes = 1, .spectral_bin_discard = 12, .spectral_bin_offset = 8, /* Can do only 1x1 VHT160 or 80+80. 780Mbps is 2x2 80Mhz or * 1x1 160Mhz, long-guard-interval. 
*/ .vht160_mcs_rx_highest = 780, .vht160_mcs_tx_highest = 780, .n_cipher_suites = 11, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA9377_HW_1_0_DEV_VERSION, .dev_id = QCA9377_1_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca9377 hw1.0", .patch_load_addr = QCA9377_HW_1_0_PATCH_LOAD_ADDR, .uart_pin = 6, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { .dir = QCA9377_HW_1_0_FW_DIR, .board_size = QCA9377_BOARD_DATA_SZ, .board_ext_size = QCA9377_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca988x_ops, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA9377_HW_1_1_DEV_VERSION, .dev_id = QCA9377_1_0_DEVICE_ID, .bus = ATH10K_BUS_PCI, .name = "qca9377 hw1.1", .patch_load_addr = QCA9377_HW_1_0_PATCH_LOAD_ADDR, .uart_pin = 6, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { .dir = QCA9377_HW_1_0_FW_DIR, .board_size = 
QCA9377_BOARD_DATA_SZ, .board_ext_size = QCA9377_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca6174_ops, .hw_clk = qca6174_clk, .target_cpu_freq = 176000000, .decap_align_bytes = 4, .spectral_bin_discard = 0, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 8, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = true, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA9377_HW_1_1_DEV_VERSION, .dev_id = QCA9377_1_0_DEVICE_ID, .bus = ATH10K_BUS_SDIO, .name = "qca9377 hw1.1 sdio", .patch_load_addr = QCA9377_HW_1_0_PATCH_LOAD_ADDR, .uart_pin = 19, .led_pin = 0, .otp_exe_param = 0, .channel_counters_freq_hz = 88000, .max_probe_resp_desc_thres = 0, .cal_data_len = 8124, .fw = { .dir = QCA9377_HW_1_0_FW_DIR, .board_size = QCA9377_BOARD_DATA_SZ, .board_ext_size = QCA9377_BOARD_EXT_DATA_SZ, }, .rx_desc_ops = &qca988x_rx_desc_ops, .hw_ops = &qca6174_ops, .hw_clk = qca6174_clk, .target_cpu_freq = 176000000, .decap_align_bytes = 4, .n_cipher_suites = 8, .num_peers = TARGET_QCA9377_HL_NUM_PEERS, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .uart_pin_workaround = true, .credit_size_workaround = true, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = QCA4019_HW_1_0_DEV_VERSION, .dev_id = 0, .bus = ATH10K_BUS_AHB, .name = "qca4019 hw1.0", .patch_load_addr = QCA4019_HW_1_0_PATCH_LOAD_ADDR, .uart_pin = 7, .led_pin = 0, .cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_EACH, .otp_exe_param = 0x0010000, .continuous_frag_desc = true, 
.cck_rate_map_rev2 = true, .channel_counters_freq_hz = 125000, .max_probe_resp_desc_thres = 24, .tx_chain_mask = 0x3, .rx_chain_mask = 0x3, .max_spatial_stream = 2, .cal_data_len = 12064, .fw = { .dir = QCA4019_HW_1_0_FW_DIR, .board_size = QCA4019_BOARD_DATA_SZ, .board_ext_size = QCA4019_BOARD_EXT_DATA_SZ, }, .sw_decrypt_mcast_mgmt = true, .rx_desc_ops = &qca99x0_rx_desc_ops, .hw_ops = &qca99x0_ops, .decap_align_bytes = 1, .spectral_bin_discard = 4, .spectral_bin_offset = 0, .vht160_mcs_rx_highest = 0, .vht160_mcs_tx_highest = 0, .n_cipher_suites = 11, .ast_skid_limit = 0x10, .num_wds_entries = 0x20, .target_64bit = false, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL, .shadow_reg_support = false, .rri_on_ddr = false, .hw_filter_reset_required = true, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = false, .hw_restart_disconnect = false, .use_fw_tx_credits = true, .delay_unmap_buffer = false, .mcast_frame_registration = false, }, { .id = WCN3990_HW_1_0_DEV_VERSION, .dev_id = 0, .bus = ATH10K_BUS_SNOC, .name = "wcn3990 hw1.0", .led_pin = 0, .continuous_frag_desc = true, .tx_chain_mask = 0x7, .rx_chain_mask = 0x7, .max_spatial_stream = 4, .fw = { .dir = WCN3990_HW_1_0_FW_DIR, .board_size = WCN3990_BOARD_DATA_SZ, .board_ext_size = WCN3990_BOARD_EXT_DATA_SZ, }, .sw_decrypt_mcast_mgmt = true, .rx_desc_ops = &wcn3990_rx_desc_ops, .hw_ops = &wcn3990_ops, .decap_align_bytes = 1, .num_peers = TARGET_HL_TLV_NUM_PEERS, .n_cipher_suites = 11, .ast_skid_limit = TARGET_HL_TLV_AST_SKID_LIMIT, .num_wds_entries = TARGET_HL_TLV_NUM_WDS_ENTRIES, .target_64bit = true, .rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL_DUAL_MAC, .shadow_reg_support = true, .rri_on_ddr = true, .hw_filter_reset_required = false, .fw_diag_ce_download = false, .credit_size_workaround = false, .tx_stats_over_pktlog = false, .dynamic_sar_support = true, .hw_restart_disconnect = true, .use_fw_tx_credits = false, .delay_unmap_buffer = true, 
.mcast_frame_registration = false, }, };

/* Printable names of the firmware features, indexed by the
 * ATH10K_FW_FEATURE_* value of each feature.
 */
static const char *const ath10k_core_fw_feature_str[] = {
	[ATH10K_FW_FEATURE_EXT_WMI_MGMT_RX] = "wmi-mgmt-rx",
	[ATH10K_FW_FEATURE_WMI_10X] = "wmi-10.x",
	[ATH10K_FW_FEATURE_HAS_WMI_MGMT_TX] = "has-wmi-mgmt-tx",
	[ATH10K_FW_FEATURE_NO_P2P] = "no-p2p",
	[ATH10K_FW_FEATURE_WMI_10_2] = "wmi-10.2",
	[ATH10K_FW_FEATURE_MULTI_VIF_PS_SUPPORT] = "multi-vif-ps",
	[ATH10K_FW_FEATURE_WOWLAN_SUPPORT] = "wowlan",
	[ATH10K_FW_FEATURE_IGNORE_OTP_RESULT] = "ignore-otp",
	[ATH10K_FW_FEATURE_NO_NWIFI_DECAP_4ADDR_PADDING] = "no-4addr-pad",
	[ATH10K_FW_FEATURE_SUPPORTS_SKIP_CLOCK_INIT] = "skip-clock-init",
	[ATH10K_FW_FEATURE_RAW_MODE_SUPPORT] = "raw-mode",
	[ATH10K_FW_FEATURE_SUPPORTS_ADAPTIVE_CCA] = "adaptive-cca",
	[ATH10K_FW_FEATURE_MFP_SUPPORT] = "mfp",
	[ATH10K_FW_FEATURE_PEER_FLOW_CONTROL] = "peer-flow-ctrl",
	[ATH10K_FW_FEATURE_BTCOEX_PARAM] = "btcoex-param",
	[ATH10K_FW_FEATURE_SKIP_NULL_FUNC_WAR] = "skip-null-func-war",
	[ATH10K_FW_FEATURE_ALLOWS_MESH_BCAST] = "allows-mesh-bcast",
	[ATH10K_FW_FEATURE_NO_PS] = "no-ps",
	[ATH10K_FW_FEATURE_MGMT_TX_BY_REF] = "mgmt-tx-by-reference",
	[ATH10K_FW_FEATURE_NON_BMI] = "non-bmi",
	[ATH10K_FW_FEATURE_SINGLE_CHAN_INFO_PER_CHANNEL] = "single-chan-info-per-channel",
	[ATH10K_FW_FEATURE_PEER_FIXED_RATE] = "peer-fixed-rate",
	[ATH10K_FW_FEATURE_IRAM_RECOVERY] = "iram-recovery",
};

/* Format the name of one firmware feature into @buf.
 *
 * @buf/@buf_len: destination and its size, handed straight to scnprintf().
 * @feat: feature to name.
 *
 * When @feat lies beyond the name table, or its table slot is NULL (the
 * latter also fires WARN_ON), the text "bit<feat>" is written instead of a
 * name. Returns the scnprintf() result.
 */
static unsigned int ath10k_core_get_fw_feature_str(char *buf, size_t buf_len, enum ath10k_fw_features feat)
{
	/* make sure that ath10k_core_fw_feature_str[] gets updated */
	BUILD_BUG_ON(ARRAY_SIZE(ath10k_core_fw_feature_str) != ATH10K_FW_FEATURE_COUNT);

	if (feat >= ARRAY_SIZE(ath10k_core_fw_feature_str) || WARN_ON(!ath10k_core_fw_feature_str[feat])) {
		return scnprintf(buf, buf_len, "bit%d", feat);
	}

	return scnprintf(buf, buf_len, "%s", ath10k_core_fw_feature_str[feat]);
}

/* Build a comma-separated list of firmware feature names in @buf.
 *
 * Walks every feature index below ATH10K_FW_FEATURE_COUNT and appends the
 * name of each one whose bit tests as set. @buf is only written when at
 * least one bit is set, so callers should pass a pre-initialised buffer.
 */
void ath10k_core_get_fw_features_str(struct ath10k *ar, char *buf, size_t buf_len)
{
	/* number of bytes already written to buf */
	size_t len = 0;
	int i;

	for (i = 0; i < ATH10K_FW_FEATURE_COUNT; i++) {
		if (test_bit(i,
ar->normal_mode_fw.fw_file.fw_features)) {
			/* separate entries with a comma, except before the first */
			if (len > 0)
				len += scnprintf(buf + len, buf_len - len, ",");

			len += ath10k_core_get_fw_feature_str(buf + len, buf_len - len, i);
		}
	}
}

/* Log the event and signal the ar->target_suspend completion. */
static void ath10k_send_suspend_complete(struct ath10k *ar)
{
	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot suspend complete\n");

	complete(&ar->target_suspend);
}

/* Write the SDIO-related host interest values over BMI.
 *
 * @mode: firmware mode; it decides how two of the hi_acs_flags bits are set.
 *
 * Sets the mailbox I/O block size (256) and the mailbox ISR yield limit (99),
 * then read-modify-writes hi_acs_flags and hi_option_flag2.
 * Returns 0 on success or the first error reported by a BMI read/write.
 */
static int ath10k_init_sdio(struct ath10k *ar, enum ath10k_firmware_mode mode)
{
	bool mtu_workaround = ar->hw_params.credit_size_workaround;
	int ret;
	u32 param = 0;

	ret = ath10k_bmi_write32(ar, hi_mbox_io_block_sz, 256);
	if (ret)
		return ret;

	ret = ath10k_bmi_write32(ar, hi_mbox_isr_yield_limit, 99);
	if (ret)
		return ret;

	ret = ath10k_bmi_read32(ar, hi_acs_flags, &param);
	if (ret)
		return ret;

	param |= HI_ACS_FLAGS_SDIO_REDUCE_TX_COMPL_SET;

	/* The alternate data credit size is enabled only for normal mode
	 * firmware on hardware without the credit size workaround; in every
	 * other case the bit is explicitly cleared.
	 */
	if (mode == ATH10K_FIRMWARE_MODE_NORMAL && !mtu_workaround)
		param |= HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE;
	else
		param &= ~HI_ACS_FLAGS_ALT_DATA_CREDIT_SIZE;

	/* The swap-mailbox bit is cleared for UTF mode, set for all others. */
	if (mode == ATH10K_FIRMWARE_MODE_UTF)
		param &= ~HI_ACS_FLAGS_SDIO_SWAP_MAILBOX_SET;
	else
		param |= HI_ACS_FLAGS_SDIO_SWAP_MAILBOX_SET;

	ret = ath10k_bmi_write32(ar, hi_acs_flags, param);
	if (ret)
		return ret;

	ret = ath10k_bmi_read32(ar, hi_option_flag2, &param);
	if (ret)
		return ret;

	param |= HI_OPTION_SDIO_CRASH_DUMP_ENHANCEMENT_HOST;

	ret = ath10k_bmi_write32(ar, hi_option_flag2, param);
	if (ret)
		return ret;

	return 0;
}

/* Write the basic target configuration (HTC version, firmware mode option
 * flags and further host interest values) over BMI. Each failing step logs
 * an error and returns the BMI error code.
 */
static int ath10k_init_configure_target(struct ath10k *ar)
{
	u32 param_host;
	int ret;

	/* tell the target which HTC protocol version is in use */
	ret = ath10k_bmi_write32(ar, hi_app_host_interest, HTC_PROTOCOL_VERSION);
	if (ret) {
		ath10k_err(ar, "settings HTC version failed\n");
		return ret;
	}

	/* set the firmware mode to STA/IBSS/AP */
	ret = ath10k_bmi_read32(ar, hi_option_flag, &param_host);
	if (ret) {
		ath10k_err(ar, "setting firmware mode (1/2) failed\n");
		return ret;
	}

	/* TODO following parameters need to be re-visited.
*/
	/* The bits below are OR-ed into the hi_option_flag value that was
	 * read from the target; bits that were already set stay set.
	 */
	/* num_device */
	param_host |= (1 << HI_OPTION_NUM_DEV_SHIFT);
	/* Firmware mode */
	/* FIXME: Why FW_MODE_AP ??.*/
	param_host |= (HI_OPTION_FW_MODE_AP << HI_OPTION_FW_MODE_SHIFT);
	/* mac_addr_method */
	param_host |= (1 << HI_OPTION_MAC_ADDR_METHOD_SHIFT);
	/* firmware_bridge (OR-ing in 0 changes nothing, kept as a placeholder) */
	param_host |= (0 << HI_OPTION_FW_BRIDGE_SHIFT);
	/* fwsubmode (OR-ing in 0 changes nothing, kept as a placeholder) */
	param_host |= (0 << HI_OPTION_FW_SUBMODE_SHIFT);

	/* write the updated option flags back to the target */
	ret = ath10k_bmi_write32(ar, hi_option_flag, param_host);
	if (ret) {
		ath10k_err(ar, "setting firmware mode (2/2) failed\n");
		return ret;
	}

	/* We do all byte-swapping on the host */
	ret = ath10k_bmi_write32(ar, hi_be, 0);
	if (ret) {
		ath10k_err(ar, "setting host CPU BE mode failed\n");
		return ret;
	}

	/* FW descriptor/Data swap flags */
	ret = ath10k_bmi_write32(ar, hi_fw_swap, 0);
	if (ret) {
		ath10k_err(ar, "setting FW data/desc swap flags failed\n");
		return ret;
	}

	/* Some devices have a special sanity check that verifies the PCI
	 * Device ID is written to this host interest var. It is known to be
	 * required to boot QCA6164.
*/
	ret = ath10k_bmi_write32(ar, hi_hci_uart_pwr_mgmt_params_ext, ar->dev_id);
	if (ret) {
		ath10k_err(ar, "failed to set pwr_mgmt_params: %d\n", ret);
		return ret;
	}

	return 0;
}

/* Request a firmware file from the firmware loader.
 *
 * @dir: directory prefix; NULL is treated as ".".
 * @file: file name; NULL yields ERR_PTR(-ENOENT).
 *
 * When ar->board_name is set, "<dir>/<board_name>/<file>" is tried first and
 * "<dir>/<file>" is used as fallback. Returns the firmware on success or an
 * ERR_PTR() carrying the error of the last request.
 *
 * NOTE(review): the snprintf() results are not checked, so a path longer
 * than the 100 byte buffer would be requested in truncated form.
 */
static const struct firmware *ath10k_fetch_fw_file(struct ath10k *ar, const char *dir, const char *file)
{
	char filename[100];
	const struct firmware *fw;
	int ret;

	if (file == NULL)
		return ERR_PTR(-ENOENT);

	if (dir == NULL)
		dir = ".";

	/* board specific sub-directory takes precedence when configured */
	if (ar->board_name) {
		snprintf(filename, sizeof(filename), "%s/%s/%s", dir, ar->board_name, file);
		ret = firmware_request_nowarn(&fw, filename, ar->dev);
		ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot fw request '%s': %d\n", filename, ret);
		if (!ret)
			return fw;
	}

	snprintf(filename, sizeof(filename), "%s/%s", dir, file);
	ret = firmware_request_nowarn(&fw, filename, ar->dev);
	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot fw request '%s': %d\n", filename, ret);
	if (ret)
		return ERR_PTR(ret);

	return fw;
}

/* Push the extended part of the board data to the target.
 *
 * @data/@data_len: complete board file contents; the extended data is
 * expected right after the first hw_params.fw.board_size bytes.
 *
 * The target address is read from hi_board_ext_data; an address of 0 means
 * there is nothing to do and 0 is returned. Otherwise @data_len must equal
 * board_size + board_ext_size (else -EINVAL), the extended data is written
 * to the target and hi_board_ext_data_config is updated.
 */
static int ath10k_push_board_ext_data(struct ath10k *ar, const void *data, size_t data_len)
{
	u32 board_data_size = ar->hw_params.fw.board_size;
	u32 board_ext_data_size = ar->hw_params.fw.board_ext_size;
	u32 board_ext_data_addr;
	int ret;

	ret = ath10k_bmi_read32(ar, hi_board_ext_data, &board_ext_data_addr);
	if (ret) {
		ath10k_err(ar, "could not read board ext data addr (%d)\n", ret);
		return ret;
	}

	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot push board extended data addr 0x%x\n", board_ext_data_addr);

	/* no address reported by the target: nothing to push */
	if (board_ext_data_addr == 0)
		return 0;

	if (data_len != (board_data_size + board_ext_data_size)) {
		ath10k_err(ar, "invalid board (ext) data sizes %zu != %d+%d\n", data_len, board_data_size, board_ext_data_size);
		return -EINVAL;
	}

	/* the extended data starts right behind the regular board data */
	ret = ath10k_bmi_write_memory(ar, board_ext_data_addr, data + board_data_size, board_ext_data_size);
	if (ret) {
		ath10k_err(ar, "could not write board ext data (%d)\n", ret);
		return ret;
	}

	/* NOTE(review): presumably the size goes into the upper 16 bits and
	 * bit 0 marks the data as present - confirm against the target docs.
	 */
	ret = ath10k_bmi_write32(ar, hi_board_ext_data_config, (board_ext_data_size << 16) | 1);
	if (ret) {
		ath10k_err(ar, "could not write board ext data bit (%d)\n", ret);
		return ret;
	}
return 0;
}

/* Obtain board id and chip id by running the OTP image on the target.
 *
 * Returns -ENODATA when no OTP image is available, -EOPNOTSUPP when the OTP
 * result carries no usable board id, or the error of the failing BMI call.
 * On success the ids are cached in ar->id and ar->id.bmi_ids_valid is set,
 * so later calls skip the download and execution.
 */
static int ath10k_core_get_board_id_from_otp(struct ath10k *ar)
{
	u32 result, address;
	u8 board_id, chip_id;
	bool ext_bid_support;
	int ret, bmi_board_id_param;

	address = ar->hw_params.patch_load_addr;

	if (!ar->normal_mode_fw.fw_file.otp_data || !ar->normal_mode_fw.fw_file.otp_len) {
		ath10k_warn(ar, "failed to retrieve board id because of invalid otp\n");
		return -ENODATA;
	}

	/* ids cached by an earlier call: nothing to download or execute */
	if (ar->id.bmi_ids_valid) {
		ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot already acquired valid otp board id,skip download, board_id %d chip_id %d\n", ar->id.bmi_board_id, ar->id.bmi_chip_id);
		goto skip_otp_download;
	}

	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot upload otp to 0x%x len %zd for board id\n", address, ar->normal_mode_fw.fw_file.otp_len);

	ret = ath10k_bmi_fast_download(ar, address, ar->normal_mode_fw.fw_file.otp_data, ar->normal_mode_fw.fw_file.otp_len);
	if (ret) {
		ath10k_err(ar, "could not write otp for board id check: %d\n", ret);
		return ret;
	}

	/* the pre-calibration modes ask for the flash board id, all other
	 * calibration modes for the EEPROM board id
	 */
	if (ar->cal_mode == ATH10K_PRE_CAL_MODE_DT || ar->cal_mode == ATH10K_PRE_CAL_MODE_FILE || ar->cal_mode == ATH10K_PRE_CAL_MODE_NVMEM)
		bmi_board_id_param = BMI_PARAM_GET_FLASH_BOARD_ID;
	else
		bmi_board_id_param = BMI_PARAM_GET_EEPROM_BOARD_ID;

	ret = ath10k_bmi_execute(ar, address, bmi_board_id_param, &result);
	if (ret) {
		ath10k_err(ar, "could not execute otp for board id check: %d\n", ret);
		return ret;
	}

	/* split the OTP result word into its fields */
	board_id = MS(result, ATH10K_BMI_BOARD_ID_FROM_OTP);
	chip_id = MS(result, ATH10K_BMI_CHIP_ID_FROM_OTP);
	ext_bid_support = (result & ATH10K_BMI_EXT_BOARD_ID_SUPPORT);

	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot get otp board id result 0x%08x board_id %d chip_id %d ext_bid_support %d\n", result, board_id, chip_id, ext_bid_support);

	/* recorded even when the board id itself turns out to be unusable */
	ar->id.ext_bid_supported = ext_bid_support;

	/* a set status bit or a board id of 0 means there is no board id */
	if ((result & ATH10K_BMI_BOARD_ID_STATUS_MASK) != 0 || (board_id == 0)) {
		ath10k_dbg(ar, ATH10K_DBG_BOOT, "board id does not exist in otp, ignore it\n");
		return -EOPNOTSUPP;
	}

	ar->id.bmi_ids_valid = true;
	ar->id.bmi_board_id = board_id;
	ar->id.bmi_chip_id = chip_id;
skip_otp_download: return 0; } static void ath10k_core_check_bdfext(const struct dmi_header *hdr, void *data) { struct ath10k *ar = data; const char *bdf_ext; const char *magic = ATH10K_SMBIOS_BDF_EXT_MAGIC; u8 bdf_enabled; int i; if (hdr->type != ATH10K_SMBIOS_BDF_EXT_TYPE) return; if (hdr->length != ATH10K_SMBIOS_BDF_EXT_LENGTH) { ath10k_dbg(ar, ATH10K_DBG_BOOT, "wrong smbios bdf ext type length (%d).\n", hdr->length); return; } bdf_enabled = *((u8 *)hdr + ATH10K_SMBIOS_BDF_EXT_OFFSET); if (!bdf_enabled) { ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant name not found.\n"); return; } /* Only one string exists (per spec) */ bdf_ext = (char *)hdr + hdr->length; if (memcmp(bdf_ext, magic, strlen(magic)) != 0) { ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant magic does not match.\n"); return; } for (i = 0; i < strlen(bdf_ext); i++) { if (!isascii(bdf_ext[i]) || !isprint(bdf_ext[i])) { ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant name contains non ascii chars.\n"); return; } } /* Copy extension name without magic suffix */ if (strscpy(ar->id.bdf_ext, bdf_ext + strlen(magic), sizeof(ar->id.bdf_ext)) < 0) { ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant string is longer than the buffer can accommodate (variant: %s)\n", bdf_ext); return; } ath10k_dbg(ar, ATH10K_DBG_BOOT, "found and validated bdf variant smbios_type 0x%x bdf %s\n", ATH10K_SMBIOS_BDF_EXT_TYPE, bdf_ext); } static int ath10k_core_check_smbios(struct ath10k *ar) { ar->id.bdf_ext[0] = '\0'; dmi_walk(ath10k_core_check_bdfext, ar); if (ar->id.bdf_ext[0] == '\0') return -ENODATA; return 0; } int ath10k_core_check_dt(struct ath10k *ar) { struct device_node *node; const char *variant = NULL; node = ar->dev->of_node; if (!node) return -ENOENT; of_property_read_string(node, "qcom,ath10k-calibration-variant", &variant); if (!variant) return -ENODATA; if (strscpy(ar->id.bdf_ext, variant, sizeof(ar->id.bdf_ext)) < 0) ath10k_dbg(ar, ATH10K_DBG_BOOT, "bdf variant string is longer than the buffer can accommodate (variant: %s)\n", 
variant); return 0; } EXPORT_SYMBOL(ath10k_core_check_dt); static int ath10k_download_fw(struct ath10k *ar) { u32 address, data_len; const void *data; int ret; struct pm_qos_request latency_qos; address = ar->hw_params.patch_load_addr; data = ar->running_fw->fw_file.firmware_data; data_len = ar->running_fw->fw_file.firmware_len; ret = ath10k_swap_code_seg_configure(ar, &ar->running_fw->fw_file); if (ret) { ath10k_err(ar, "failed to configure fw code swap: %d\n", ret); return ret; } ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot uploading firmware image %pK len %d\n", data, data_len); /* Check if device supports to download firmware via * diag copy engine. Downloading firmware via diag CE * greatly reduces the time to download firmware. */ if (ar->hw_params.fw_diag_ce_download) { ret = ath10k_hw_diag_fast_download(ar, address, data, data_len); if (ret == 0) /* firmware upload via diag ce was successful */ return 0; ath10k_warn(ar, "failed to upload firmware via diag ce, trying BMI: %d", ret); } memset(&latency_qos, 0, sizeof(latency_qos)); cpu_latency_qos_add_request(&latency_qos, 0); ret = ath10k_bmi_fast_download(ar, address, data, data_len); cpu_latency_qos_remove_request(&latency_qos); return ret; } void ath10k_core_free_board_files(struct ath10k *ar) { if (!IS_ERR(ar->normal_mode_fw.board)) release_firmware(ar->normal_mode_fw.board); if (!IS_ERR(ar->normal_mode_fw.ext_board)) release_firmware(ar->normal_mode_fw.ext_board); ar->normal_mode_fw.board = NULL; ar->normal_mode_fw.board_data = NULL; ar->normal_mode_fw.board_len = 0; ar->normal_mode_fw.ext_board = NULL; ar->normal_mode_fw.ext_board_data = NULL; ar->normal_mode_fw.ext_board_len = 0; } EXPORT_SYMBOL(ath10k_core_free_board_files); static void ath10k_core_free_firmware_files(struct ath10k *ar) { if (!IS_ERR(ar->normal_mode_fw.fw_file.firmware)) release_firmware(ar->normal_mode_fw.fw_file.firmware); if (!IS_ERR(ar->cal_file)) release_firmware(ar->cal_file); if (!IS_ERR(ar->pre_cal_file)) 
release_firmware(ar->pre_cal_file);

	ath10k_swap_code_seg_release(ar, &ar->normal_mode_fw.fw_file);

	/* drop every reference into the firmware files released above */
	ar->normal_mode_fw.fw_file.otp_data = NULL;
	ar->normal_mode_fw.fw_file.otp_len = 0;

	ar->normal_mode_fw.fw_file.firmware = NULL;
	ar->normal_mode_fw.fw_file.firmware_data = NULL;
	ar->normal_mode_fw.fw_file.firmware_len = 0;

	ar->cal_file = NULL;
	ar->pre_cal_file = NULL;
}

/* Fetch a per-device calibration file from ATH10K_FW_DIR.
 *
 * "pre-cal-<bus>-<id>.bin" is tried first and stored in ar->pre_cal_file;
 * only if that fails "cal-<bus>-<id>.bin" is tried and stored in
 * ar->cal_file. Returns 0 if either file was found, otherwise the error of
 * the second request.
 */
static int ath10k_fetch_cal_file(struct ath10k *ar)
{
	char filename[100];

	/* pre-cal-<bus>-<id>.bin */
	scnprintf(filename, sizeof(filename), "pre-cal-%s-%s.bin", ath10k_bus_str(ar->hif.bus), dev_name(ar->dev));

	ar->pre_cal_file = ath10k_fetch_fw_file(ar, ATH10K_FW_DIR, filename);
	if (!IS_ERR(ar->pre_cal_file))
		goto success;

	/* cal-<bus>-<id>.bin */
	scnprintf(filename, sizeof(filename), "cal-%s-%s.bin", ath10k_bus_str(ar->hif.bus), dev_name(ar->dev));

	ar->cal_file = ath10k_fetch_fw_file(ar, ATH10K_FW_DIR, filename);
	if (IS_ERR(ar->cal_file))
		/* calibration file is optional, don't print any warnings */
		return PTR_ERR(ar->cal_file);
success:
	/* filename holds the name of whichever file was found */
	ath10k_dbg(ar, ATH10K_DBG_BOOT, "found calibration file %s/%s\n", ATH10K_FW_DIR, filename);

	return 0;
}

/* Fetch board data from a stand-alone (API 1) board file.
 *
 * @bd_ie_type: ATH10K_BD_IE_BOARD selects the regular board file,
 * ATH10K_BD_IE_BOARD_EXT the extended board file; any other value leaves
 * everything untouched.
 *
 * For the regular board file "board-<bus>-<id>.bin" is tried first and
 * ATH10K_BOARD_DATA_FILE is the fallback. The whole file becomes the board
 * data. Returns 0 on success or the error of the failed firmware request.
 */
static int ath10k_core_fetch_board_data_api_1(struct ath10k *ar, int bd_ie_type)
{
	const struct firmware *fw;
	char boardname[100];

	if (bd_ie_type == ATH10K_BD_IE_BOARD) {
		scnprintf(boardname, sizeof(boardname), "board-%s-%s.bin", ath10k_bus_str(ar->hif.bus), dev_name(ar->dev));

		ar->normal_mode_fw.board = ath10k_fetch_fw_file(ar, ar->hw_params.fw.dir, boardname);
		/* no device specific file: fall back to the generic one */
		if (IS_ERR(ar->normal_mode_fw.board)) {
			fw = ath10k_fetch_fw_file(ar, ar->hw_params.fw.dir, ATH10K_BOARD_DATA_FILE);
			ar->normal_mode_fw.board = fw;
		}

		if (IS_ERR(ar->normal_mode_fw.board))
			return PTR_ERR(ar->normal_mode_fw.board);

		ar->normal_mode_fw.board_data = ar->normal_mode_fw.board->data;
		ar->normal_mode_fw.board_len = ar->normal_mode_fw.board->size;
	} else if (bd_ie_type == ATH10K_BD_IE_BOARD_EXT) {
		fw = ath10k_fetch_fw_file(ar, ar->hw_params.fw.dir, ATH10K_EBOARD_DATA_FILE);
		ar->normal_mode_fw.ext_board
= fw;
		if (IS_ERR(ar->normal_mode_fw.ext_board))
			return PTR_ERR(ar->normal_mode_fw.ext_board);

		/* the whole file is the extended board data */
		ar->normal_mode_fw.ext_board_data = ar->normal_mode_fw.ext_board->data;
		ar->normal_mode_fw.ext_board_len = ar->normal_mode_fw.ext_board->size;
	}

	return 0;
}

/* Search one board container for the data belonging to @boardname.
 *
 * @buf/@buf_len: payload of the container, a sequence of ath10k_fw_ie
 * elements each padded to a multiple of 4 bytes.
 * @boardname: name to look for.
 * @bd_ie_type: ATH10K_BD_IE_BOARD stores the match as board data,
 * ATH10K_BD_IE_BOARD_EXT as extended board data.
 *
 * A data element is accepted only after a name element equal to @boardname
 * has been seen earlier in the same container. Returns 0 when data was
 * found, -ENOENT when not, and -EINVAL when an element claims to be longer
 * than the remaining buffer.
 */
static int ath10k_core_parse_bd_ie_board(struct ath10k *ar, const void *buf, size_t buf_len, const char *boardname, int bd_ie_type)
{
	const struct ath10k_fw_ie *hdr;
	bool name_match_found;
	int ret, board_ie_id;
	size_t board_ie_len;
	const void *board_ie_data;

	name_match_found = false;

	/* go through ATH10K_BD_IE_BOARD_ elements */
	while (buf_len > sizeof(struct ath10k_fw_ie)) {
		hdr = buf;
		board_ie_id = le32_to_cpu(hdr->id);
		board_ie_len = le32_to_cpu(hdr->len);
		board_ie_data = hdr->data;

		/* step over the element header; buf now points at the payload */
		buf_len -= sizeof(*hdr);
		buf += sizeof(*hdr);

		/* the padded payload has to fit into what is left */
		if (buf_len < ALIGN(board_ie_len, 4)) {
			ath10k_err(ar, "invalid ATH10K_BD_IE_BOARD length: %zu < %zu\n", buf_len, ALIGN(board_ie_len, 4));
			ret = -EINVAL;
			goto out;
		}

		switch (board_ie_id) {
		case ATH10K_BD_IE_BOARD_NAME:
			ath10k_dbg_dump(ar, ATH10K_DBG_BOOT, "board name", "", board_ie_data, board_ie_len);

			/* the name is compared by length and content; the
			 * element payload is not treated as a C string
			 */
			if (board_ie_len != strlen(boardname))
				break;

			ret = memcmp(board_ie_data, boardname, strlen(boardname));
			if (ret)
				break;

			name_match_found = true;
			ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot found match for name '%s'", boardname);
			break;
		case ATH10K_BD_IE_BOARD_DATA:
			if (!name_match_found)
				/* no match found */
				break;

			/* the data pointers refer into buf, nothing is copied */
			if (bd_ie_type == ATH10K_BD_IE_BOARD) {
				ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot found board data for '%s'", boardname);

				ar->normal_mode_fw.board_data = board_ie_data;
				ar->normal_mode_fw.board_len = board_ie_len;
			} else if (bd_ie_type == ATH10K_BD_IE_BOARD_EXT) {
				ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot found eboard data for '%s'", boardname);

				ar->normal_mode_fw.ext_board_data = board_ie_data;
				ar->normal_mode_fw.ext_board_len = board_ie_len;
			}

			ret = 0;
			goto out;
		default:
			ath10k_warn(ar, "unknown ATH10K_BD_IE_BOARD found: %d\n", board_ie_id);
			break;
		}

		/* jump over the padding */
		board_ie_len =
ALIGN(board_ie_len, 4);

		buf_len -= board_ie_len;
		buf += board_ie_len;
	}

	/* no match found */
	ret = -ENOENT;

out:
	return ret;
}

/* Search the element list of a board file for the data of @boardname.
 *
 * @data/@len: element list that follows the magic of the board file; each
 * element is an ath10k_fw_ie padded to a multiple of 4 bytes.
 *
 * Board and extended board containers are handed to
 * ath10k_core_parse_bd_ie_board(); elements of any other type are skipped.
 * The search stops at the first container that yields a match or an error.
 * Returns 0 on a match, -ENOENT when nothing matched and -EINVAL when an
 * element claims to be longer than the remaining data.
 */
static int ath10k_core_search_bd(struct ath10k *ar, const char *boardname, const u8 *data, size_t len)
{
	size_t ie_len;
	struct ath10k_fw_ie *hdr;
	int ret = -ENOENT, ie_id;

	while (len > sizeof(struct ath10k_fw_ie)) {
		hdr = (struct ath10k_fw_ie *)data;
		ie_id = le32_to_cpu(hdr->id);
		ie_len = le32_to_cpu(hdr->len);

		/* step over the element header; data now points at the payload */
		len -= sizeof(*hdr);
		data = hdr->data;

		if (len < ALIGN(ie_len, 4)) {
			ath10k_err(ar, "invalid length for board ie_id %d ie_len %zu len %zu\n", ie_id, ie_len, len);
			return -EINVAL;
		}

		switch (ie_id) {
		case ATH10K_BD_IE_BOARD:
			ret = ath10k_core_parse_bd_ie_board(ar, data, ie_len, boardname, ATH10K_BD_IE_BOARD);
			if (ret == -ENOENT)
				/* no match found, continue */
				break;

			/* either found or error, so stop searching */
			goto out;
		case ATH10K_BD_IE_BOARD_EXT:
			ret = ath10k_core_parse_bd_ie_board(ar, data, ie_len, boardname, ATH10K_BD_IE_BOARD_EXT);
			if (ret == -ENOENT)
				/* no match found, continue */
				break;

			/* either found or error, so stop searching */
			goto out;
		}

		/* jump over the padding */
		ie_len = ALIGN(ie_len, 4);

		len -= ie_len;
		data += ie_len;
	}

out:
	/* return result of parse_bd_ie_board() or -ENOENT */
	return ret;
}

/* Fetch board data for @boardname from a container style board file
 * (magic followed by a list of elements). The file is requested only when
 * ar->normal_mode_fw.board is not set yet.
 */
static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar, const char *boardname, const char *fallback_boardname1, const char *fallback_boardname2, const char *filename)
{
	size_t len, magic_len;
	const u8 *data;
	int ret;

	/* Skip if already fetched during board data download */
	if (!ar->normal_mode_fw.board)
		ar->normal_mode_fw.board = ath10k_fetch_fw_file(ar, ar->hw_params.fw.dir, filename);
	if (IS_ERR(ar->normal_mode_fw.board))
		return PTR_ERR(ar->normal_mode_fw.board);

	data = ar->normal_mode_fw.board->data;
	len = ar->normal_mode_fw.board->size;

	/* magic has extra null byte padded */
	magic_len = strlen(ATH10K_BOARD_MAGIC) + 1;
	if (len < magic_len) {
		ath10k_err(ar, "failed to find magic value in %s/%s, file too short: %zu\n",
ar->hw_params.fw.dir, filename, len); ret = -EINVAL; goto err; } if (memcmp(data, ATH10K_BOARD_MAGIC, magic_len)) { ath10k_err(ar, "found invalid board magic\n"); ret = -EINVAL; goto err; } /* magic is padded to 4 bytes */ magic_len = ALIGN(magic_len, 4); if (len < magic_len) { ath10k_err(ar, "failed: %s/%s too small to contain board data, len: %zu\n", ar->hw_params.fw.dir, filename, len); ret = -EINVAL; goto err; } data += magic_len; len -= magic_len; /* attempt to find boardname in the IE list */ ret = ath10k_core_search_bd(ar, boardname, data, len); /* if we didn't find