/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
					struct hlist_bl_node *n)
{
	LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
	LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
							LIST_BL_LOCKMASK);
	rcu_assign_pointer(h->first,
		(struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK));
}

static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
	return (struct hlist_bl_node *)
		((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) &
		~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
	__hlist_bl_del(n);
	n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist_bl,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
					struct hlist_bl_head *h)
{
	struct hlist_bl_node *first;

	/* don't need hlist_bl_first_rcu because we're under lock */
	first = hlist_bl_first(h);

	n->next = first;
	if (first)
		first->pprev = &n->next;
	n->pprev = &h->first;

	/* need _rcu because we can have concurrent lock free readers */
	hlist_bl_set_first_rcu(h, n);
}

/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:	the type * to use as a loop cursor.
 * @pos:	the &struct hlist_bl_node to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_bl_node within the struct.
 *
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)		\
	for (pos = hlist_bl_first_rcu(head);				\
		pos &&							\
		({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
		pos = rcu_dereference_raw(pos->next))

#endif
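/*
 * Illustrative sketch (not part of this header): a lockless lookup over an
 * hlist_bl chain using hlist_bl_for_each_entry_rcu().  The struct "foo",
 * the hash head and foo_lookup() are hypothetical; the pattern of guarding
 * the traversal with rcu_read_lock() is what the comments above require.
 *
 *	struct foo {
 *		int key;
 *		struct hlist_bl_node node;
 *	};
 *
 *	static struct foo *foo_lookup(struct hlist_bl_head *head, int key)
 *	{
 *		struct foo *f;
 *		struct hlist_bl_node *pos;
 *
 *		rcu_read_lock();
 *		hlist_bl_for_each_entry_rcu(f, pos, head, node) {
 *			if (f->key == key) {
 *				rcu_read_unlock();
 *				return f;  // caller must keep f valid (refcount etc.)
 *			}
 *		}
 *		rcu_read_unlock();
 *		return NULL;
 *	}
 */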
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>
#include <linux/file_ref.h>
#include <linux/unicode.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_inode_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
struct iomap_ops;

extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;

typedef __kernel_rwf_t rwf_t;

struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create);
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
			ssize_t bytes, void *private);

#define MAY_EXEC		0x00000001
#define MAY_WRITE		0x00000002
#define MAY_READ		0x00000004
#define MAY_APPEND		0x00000008
#define MAY_ACCESS		0x00000010
#define MAY_OPEN		0x00000020
#define MAY_CHDIR		0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK		0x00000080

/*
 * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
 */

/* file is open for reading */
#define FMODE_READ		((__force fmode_t)(1 << 0))
/* file is open for writing */
#define FMODE_WRITE		((__force fmode_t)(1 << 1))
/* file is seekable */
#define FMODE_LSEEK		((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
#define FMODE_PREAD		((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
#define FMODE_PWRITE		((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC		((__force fmode_t)(1 << 5))
/* File writes are restricted (block device specific) */
#define FMODE_WRITE_RESTRICTED	((__force fmode_t)(1 << 6))
/* File supports atomic writes */
#define FMODE_CAN_ATOMIC_WRITE	((__force fmode_t)(1 << 7))

/* FMODE_* bit 8 */

/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH		((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH		((__force fmode_t)(1 << 10))

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME		((__force fmode_t)(1 << 11))

/* Expect random access pattern */
#define FMODE_RANDOM		((__force fmode_t)(1 << 12))

/* FMODE_* bit 13 */

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH		((__force fmode_t)(1 << 14))

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS	((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
#define FMODE_WRITER		((__force fmode_t)(1 << 16))
/* Has read method(s) */
#define FMODE_CAN_READ		((__force fmode_t)(1 << 17))
/* Has write method(s) */
#define FMODE_CAN_WRITE		((__force fmode_t)(1 << 18))

#define FMODE_OPENED		((__force fmode_t)(1 << 19))
#define FMODE_CREATED		((__force fmode_t)(1 << 20))

/* File is stream-like */
#define FMODE_STREAM		((__force fmode_t)(1 << 21))

/* File supports DIRECT IO */
#define FMODE_CAN_ODIRECT	((__force fmode_t)(1 << 22))

#define FMODE_NOREUSE		((__force fmode_t)(1 << 23))

/* File is embedded in backing_file object */
#define FMODE_BACKING		((__force fmode_t)(1 << 24))

/*
 * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be
 * generated (see below)
 */
#define FMODE_NONOTIFY		((__force fmode_t)(1 << 25))

/*
 * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be
 * generated (see below)
 */
#define FMODE_NONOTIFY_PERM	((__force fmode_t)(1 << 26))

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT		((__force fmode_t)(1 << 27))

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT	((__force fmode_t)(1 << 28))

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT		((__force fmode_t)(1 << 29))

/*
 * The two FMODE_NONOTIFY* define which fsnotify events should not be generated
 * for a file.  These are the possible values of (f->f_mode &
 * FMODE_FSNOTIFY_MASK) and their meaning:
 *
 * FMODE_NONOTIFY - suppress all (incl. non-permission) events.
 * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
 * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events.
 */
#define FMODE_FSNOTIFY_MASK \
	(FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)

#define FMODE_FSNOTIFY_NONE(mode) \
	((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
#define FMODE_FSNOTIFY_PERM(mode) \
	((mode & FMODE_FSNOTIFY_MASK) == 0 || \
	 (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
#define FMODE_FSNOTIFY_HSM(mode) \
	((mode & FMODE_FSNOTIFY_MASK) == 0)
#else
#define FMODE_FSNOTIFY_PERM(mode)	0
#define FMODE_FSNOTIFY_HSM(mode)	0
#endif

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 */
#define ATTR_MODE	(1 << 0)
#define ATTR_UID	(1 << 1)
#define ATTR_GID	(1 << 2)
#define ATTR_SIZE	(1 << 3)
#define ATTR_ATIME	(1 << 4)
#define ATTR_MTIME	(1 << 5)
#define ATTR_CTIME	(1 << 6)
#define ATTR_ATIME_SET	(1 << 7)
#define ATTR_MTIME_SET	(1 << 8)
#define ATTR_FORCE	(1 << 9) /* Not a change, but a change it */
#define ATTR_KILL_SUID	(1 << 11)
#define ATTR_KILL_SGID	(1 << 12)
#define ATTR_FILE	(1 << 13)
#define ATTR_KILL_PRIV	(1 << 14)
#define ATTR_OPEN	(1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET	(1 << 16)
#define ATTR_TOUCH	(1 << 17)
#define ATTR_DELEG	(1 << 18) /* Delegated attrs. Don't break write delegations */

/*
 * Whiteout is represented by a char device.  The following constants define
 * the mode and device number to use.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
	unsigned int	ia_valid;
	umode_t		ia_mode;
	/*
	 * The two anonymous unions wrap structures with the same member.
	 *
	 * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
	 * are a dedicated type requiring the filesystem to use the dedicated
	 * helpers. Other filesystems can continue to use ia_{g,u}id until
	 * they have been ported.
	 *
	 * They always contain the same value. In other words FS_ALLOW_IDMAP
	 * pass down the same value on idmapped mounts as they would on
	 * regular mounts.
	 */
	union {
		kuid_t		ia_uid;
		vfsuid_t	ia_vfsuid;
	};
	union {
		kgid_t		ia_gid;
		vfsgid_t	ia_vfsgid;
	};
	loff_t		ia_size;
	struct timespec64 ia_atime;
	struct timespec64 ia_mtime;
	struct timespec64 ia_ctime;

	/*
	 * Not an attribute, but an auxiliary info for filesystems wanting to
	 * implement an ftruncate() like method.  NOTE: filesystem should
	 * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
	 */
	struct file	*ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers of fs stack.  Needs to be limited to
 * prevent kernel stack overflow
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2
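/*
 * Illustrative sketch (not part of this header): how a caller typically
 * fills in a struct iattr for a size change before handing it to the VFS.
 * The dentry, idmap and surrounding locking context are hypothetical, and
 * notify_change()'s exact signature should be checked against the tree in
 * use; the point is that ia_valid flags declare which members carry a new
 * value.
 *
 *	struct iattr attr = {
 *		.ia_valid = ATTR_SIZE | ATTR_MTIME | ATTR_CTIME,
 *		.ia_size  = new_size,
 *	};
 *	int error;
 *
 *	// inode must be locked by the caller
 *	error = notify_change(idmap, dentry, &attr, NULL);
 */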
/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
 *			    completed, that the page is still locked, and
 *			    should be considered active.  The VM uses this hint
 *			    to return the page to the active list -- it won't
 *			    be a candidate for writeback again in the near
 *			    future.  Other callers must be careful to unlock
 *			    the page if they get this return.  Returned by
 *			    writepage();
 *
 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
 *			unlocked it and the page might have been truncated.
 *			The caller should back up to acquiring a new page and
 *			trying again.  The aop will be taking reasonable
 *			precautions not to livelock.  If the caller held a page
 *			reference, it should drop it before retrying.  Returned
 *			by read_folio().
 *
 * address_space_operation functions return these large constants to indicate
 * special semantics to the caller.  These are much larger than the bytes in a
 * page to allow for functions that return the number of bytes operated on in a
 * given page.
 */

enum positive_aop_returns {
	AOP_WRITEPAGE_ACTIVATE	= 0x80000,
	AOP_TRUNCATED_PAGE	= 0x80001,
};

/*
 * oh the beauties of C type declarations.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/* Match RWF_* bits to IOCB bits */
#define IOCB_HIPRI		(__force int) RWF_HIPRI
#define IOCB_DSYNC		(__force int) RWF_DSYNC
#define IOCB_SYNC		(__force int) RWF_SYNC
#define IOCB_NOWAIT		(__force int) RWF_NOWAIT
#define IOCB_APPEND		(__force int) RWF_APPEND
#define IOCB_ATOMIC		(__force int) RWF_ATOMIC
#define IOCB_DONTCACHE		(__force int) RWF_DONTCACHE

/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD		(1 << 16)
#define IOCB_DIRECT		(1 << 17)
#define IOCB_WRITE		(1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ		(1 << 19)
#define IOCB_NOIO		(1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE	(1 << 21)
/*
 * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
 * iocb completion can be passed back to the owner for execution from a safe
 * context rather than needing to be punted through a workqueue. If this
 * flag is set, the bio completion handling may set iocb->dio_complete to a
 * handler function and iocb->private to context information for that handler.
 * The issuer should call the handler with that context information from task
 * context to complete the processing of the iocb. Note that while this
 * provides a task context for the dio_complete() callback, it should only be
 * used on the completion side for non-IO generating completions. It's fine to
 * call blocking functions from this callback, but they should not wait for
 * unrelated IO (like cache flushing, new IO generation, etc).
 */
#define IOCB_DIO_CALLER_COMP	(1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW		(1 << 23)
#define IOCB_HAS_METADATA	(1 << 24)

/* for use in trace events */
#define TRACE_IOCB_STRINGS \
	{ IOCB_HIPRI,		"HIPRI" }, \
	{ IOCB_DSYNC,		"DSYNC" }, \
	{ IOCB_SYNC,		"SYNC" }, \
	{ IOCB_NOWAIT,		"NOWAIT" }, \
	{ IOCB_APPEND,		"APPEND" }, \
	{ IOCB_ATOMIC,		"ATOMIC" }, \
	{ IOCB_DONTCACHE,	"DONTCACHE" }, \
	{ IOCB_EVENTFD,		"EVENTFD"}, \
	{ IOCB_DIRECT,		"DIRECT" }, \
	{ IOCB_WRITE,		"WRITE" }, \
	{ IOCB_WAITQ,		"WAITQ" }, \
	{ IOCB_NOIO,		"NOIO" }, \
	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }

struct kiocb {
	struct file		*ki_filp;
	loff_t			ki_pos;
	void (*ki_complete)(struct kiocb *iocb, long ret);
	void			*private;
	int			ki_flags;
	u16			ki_ioprio; /* See linux/ioprio.h */
	union {
		/*
		 * Only used for async buffered reads, where it denotes the
		 * page waitqueue associated with completing the read. Valid
		 * IFF IOCB_WAITQ is set.
		 */
		struct wait_page_queue	*ki_waitq;
		/*
		 * Can be used for O_DIRECT IO, where the completion handling
		 * is punted back to the issuer of the IO. May only be set
		 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
		 * must then check for presence of this handler when ki_complete
		 * is invoked. The data passed in to this handler must be
		 * assigned to ->private when dio_complete is assigned.
		 */
		ssize_t (*dio_complete)(void *data);
	};
};

static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
	return kiocb->ki_complete == NULL;
}
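/*
 * Illustrative sketch (not part of this header): the usual shape of an
 * asynchronous completion in a driver's read path.  "my_read_iter",
 * "do_read_now" and "queue_read" are hypothetical; the contract shown --
 * call ki_complete() exactly once later for async iocbs, return the byte
 * count directly for sync ones -- follows from is_sync_kiocb() above.
 *
 *	static ssize_t my_read_iter(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		if (is_sync_kiocb(iocb))
 *			return do_read_now(iocb, to);	// synchronous path
 *		if (iocb->ki_flags & IOCB_NOWAIT)
 *			return -EAGAIN;			// would have to block
 *		queue_read(iocb, to);	// later: iocb->ki_complete(iocb, ret)
 *		return -EIOCBQUEUED;
 *	}
 */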
struct address_space_operations {
	int (*writepage)(struct page *page, struct writeback_control *wbc);
	int (*read_folio)(struct file *, struct folio *);

	/* Write back some dirty pages from this mapping. */
	int (*writepages)(struct address_space *, struct writeback_control *);

	/* Mark a folio dirty.  Return true if this dirtied it */
	bool (*dirty_folio)(struct address_space *, struct folio *);

	void (*readahead)(struct readahead_control *);

	int (*write_begin)(struct file *, struct address_space *mapping,
				loff_t pos, unsigned len,
				struct folio **foliop, void **fsdata);
	int (*write_end)(struct file *, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct folio *folio, void *fsdata);

	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
	sector_t (*bmap)(struct address_space *, sector_t);
	void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
	bool (*release_folio)(struct folio *, gfp_t);
	void (*free_folio)(struct folio *folio);
	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
	/*
	 * migrate the contents of a folio to the specified target. If
	 * migrate_mode is MIGRATE_ASYNC, it must not block.
	 */
	int (*migrate_folio)(struct address_space *, struct folio *dst,
			struct folio *src, enum migrate_mode);
	int (*launder_folio)(struct folio *);
	bool (*is_partially_uptodate) (struct folio *, size_t from,
			size_t count);
	void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
	int (*error_remove_folio)(struct address_space *, struct folio *);

	/* swapfile support */
	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
				sector_t *span);
	void (*swap_deactivate)(struct file *file);
	int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};

extern const struct address_space_operations empty_aops;

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @invalidate_lock: Guards coherency between page cache contents and
 *   file offset->disk block mappings in the filesystem during invalidates.
 *   It is also used to block modification of page cache contents through
 *   memory mappings.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @i_private_lock: For use by the owner of the address_space.
 * @i_private_list: For use by the owner of the address_space.
 * @i_private_data: For use by the owner of the address_space.
 */
struct address_space {
	struct inode		*host;
	struct xarray		i_pages;
	struct rw_semaphore	invalidate_lock;
	gfp_t			gfp_mask;
	atomic_t		i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
	/* number of thp, only for non-shmem files */
	atomic_t		nr_thps;
#endif
	struct rb_root_cached	i_mmap;
	unsigned long		nrpages;
	pgoff_t			writeback_index;
	const struct address_space_operations *a_ops;
	unsigned long		flags;
	errseq_t		wb_err;
	spinlock_t		i_private_lock;
	struct list_head	i_private_list;
	struct rw_semaphore	i_mmap_rwsem;
	void *			i_private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
	/*
	 * On most architectures that alignment is already the case; but
	 * must be enforced here for CRIS, to let the least significant bit
	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
	 */

/* XArray tags, for tagging dirty and writeback pages in the pagecache. */
#define PAGECACHE_TAG_DIRTY	XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK	XA_MARK_1
#define PAGECACHE_TAG_TOWRITE	XA_MARK_2

/*
 * Returns true if any of the pages in the mapping are marked with the tag.
 */
static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{
	return xa_marked(&mapping->i_pages, tag);
}

static inline void i_mmap_lock_write(struct address_space *mapping)
{
	down_write(&mapping->i_mmap_rwsem);
}

static inline int i_mmap_trylock_write(struct address_space *mapping)
{
	return down_write_trylock(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_unlock_write(struct address_space *mapping)
{
	up_write(&mapping->i_mmap_rwsem);
}

static inline int i_mmap_trylock_read(struct address_space *mapping)
{
	return down_read_trylock(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_lock_read(struct address_space *mapping)
{
	down_read(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_unlock_read(struct address_space *mapping)
{
	up_read(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_assert_locked(struct address_space *mapping)
{
	lockdep_assert_held(&mapping->i_mmap_rwsem);
}

static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
	lockdep_assert_held_write(&mapping->i_mmap_rwsem);
}

/*
 * Might pages of this file be mapped into userspace?
 */
static inline int mapping_mapped(struct address_space *mapping)
{
	return	!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root);
}

/*
 * Might pages of this file have been modified in userspace?
 * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
 * marks vma as VM_SHARED if it is shared, and the file was opened for
 * writing i.e. vma may be mprotected writable even if now readonly.
 *
 * If i_mmap_writable is negative, no new writable mappings are allowed. You
 * can only deny writable mappings, if none exists right now.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
	return atomic_read(&mapping->i_mmap_writable) > 0;
}

static inline int mapping_map_writable(struct address_space *mapping)
{
	return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
		0 : -EPERM;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
	atomic_dec(&mapping->i_mmap_writable);
}

static inline int mapping_deny_writable(struct address_space *mapping)
{
	return atomic_dec_unless_positive(&mapping->i_mmap_writable) ?
		0 : -EBUSY;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
	atomic_inc(&mapping->i_mmap_writable);
}
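/*
 * Illustrative sketch (not part of this header): the pairing expected of
 * mapping_map_writable()/mapping_unmap_writable(), e.g. from mmap code
 * creating a shared writable mapping.  "my_mmap" is hypothetical; the
 * increment must be undone when the mapping goes away.
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err;
 *
 *		if (vma->vm_flags & VM_SHARED) {
 *			err = mapping_map_writable(mapping);
 *			if (err)	// -EPERM: writable mappings denied
 *				return err;
 *		}
 *		// ... set up the mapping ...
 *		// on teardown: mapping_unmap_writable(mapping);
 *		return 0;
 *	}
 */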
/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
#define ACL_NOT_CACHED ((void *)(-1))
/*
 * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
 * cache the ACL.  This also means that ->get_inode_acl() can be called in RCU
 * mode with the LOOKUP_RCU flag.
 */
#define ACL_DONT_CACHE ((void *)(-3))

static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
	return (void *)task + 1;
}

static inline bool
is_uncached_acl(struct posix_acl *acl)
{
	return (long)acl & 1;
}

#define IOP_FASTPERM		0x0001
#define IOP_LOOKUP		0x0002
#define IOP_NOFOLLOW		0x0004
#define IOP_XATTR		0x0008
#define IOP_DEFAULT_READLINK	0x0010
#define IOP_MGTIME		0x0020
#define IOP_CACHED_LINK		0x0040

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {
	umode_t			i_mode;
	unsigned short		i_opflags;
	kuid_t			i_uid;
	kgid_t			i_gid;
	unsigned int		i_flags;

#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl	*i_acl;
	struct posix_acl	*i_default_acl;
#endif

	const struct inode_operations	*i_op;
	struct super_block	*i_sb;
	struct address_space	*i_mapping;

#ifdef CONFIG_SECURITY
	void			*i_security;
#endif

	/* Stat data, not accessed from path walking */
	unsigned long		i_ino;
	/*
	 * Filesystems may only read i_nlink directly.  They shall use the
	 * following functions for modification:
	 *
	 *    (set|clear|inc|drop)_nlink
	 *    inode_(inc|dec)_link_count
	 */
	union {
		const unsigned int i_nlink;
		unsigned int __i_nlink;
	};
	dev_t			i_rdev;
	loff_t			i_size;
	time64_t		i_atime_sec;
	time64_t		i_mtime_sec;
	time64_t		i_ctime_sec;
	u32			i_atime_nsec;
	u32			i_mtime_nsec;
	u32			i_ctime_nsec;
	u32			i_generation;
	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
	unsigned short		i_bytes;
	u8			i_blkbits;
	enum rw_hint		i_write_hint;
	blkcnt_t		i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
	seqcount_t		i_size_seqcount;
#endif

	/* Misc */
	u32			i_state;
	/* 32-bit hole */
	struct rw_semaphore	i_rwsem;

	unsigned long		dirtied_when;	/* jiffies of first dirtying */
	unsigned long		dirtied_time_when;

	struct hlist_node	i_hash;
	struct list_head	i_io_list;	/* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */

	/* foreign inode detection, see wbc_detach_inode() */
	int			i_wb_frn_winner;
	u16			i_wb_frn_avg_time;
	u16			i_wb_frn_history;
#endif
	struct list_head	i_lru;		/* inode LRU list */
	struct list_head	i_sb_list;
	struct list_head	i_wb_list;	/* backing dev writeback list */
	union {
		struct hlist_head	i_dentry;
		struct rcu_head		i_rcu;
	};
	atomic64_t		i_version;
	atomic64_t		i_sequence; /* see futex */
	atomic_t		i_count;
	atomic_t		i_dio_count;
	atomic_t		i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
	atomic_t		i_readcount; /* struct files open RO */
#endif
	union {
		const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
		void (*free_inode)(struct inode *);
	};
	struct file_lock_context	*i_flctx;
	struct address_space	i_data;
	union {
		struct list_head	i_devices;
		int			i_linklen;
	};
	union {
		struct pipe_inode_info	*i_pipe;
		struct cdev		*i_cdev;
		char			*i_link;
		unsigned		i_dir_seq;
	};

#ifdef CONFIG_FSNOTIFY
	__u32			i_fsnotify_mask; /* all events this inode cares about */
	/* 32-bit hole reserved for expanding i_fsnotify_mask */
	struct fsnotify_mark_connector __rcu	*i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
	struct fscrypt_inode_info	*i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
	struct fsverity_info	*i_verity_info;
#endif

	void			*i_private; /* fs or device private pointer */
} __randomize_layout;

static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
	VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
	VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode);
	inode->i_link = link;
	inode->i_linklen = linklen;
	inode->i_opflags |= IOP_CACHED_LINK;
}

/*
 * Get bit address from inode->i_state to use with wait_var_event()
 * infrastructure.
 */
#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))

struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
					    struct inode *inode, u32 bit);

static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
{
	/* Caller is responsible for correct memory barriers. */
	wake_up_var(inode_state_wait_address(inode, bit));
}

struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

static inline unsigned int i_blocksize(const struct inode *node)
{
	return (1 << node->i_blkbits);
}

static inline int inode_unhashed(struct inode *inode)
{
	return hlist_unhashed(&inode->i_hash);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
 * want special inodes in the fileset inode space, we make them
 * appear hashed, but do not put on any lists.  hlist_del()
 * will work fine and require no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
	hlist_add_fake(&inode->i_hash);
}
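/*
 * Illustrative sketch (not part of this header): a filesystem caching a
 * symlink body at inode setup time so the VFS can follow it without an
 * ->get_link() call.  "fs_fill_symlink" and "fs_symlink_inode_operations"
 * are hypothetical; the string must stay valid for the inode's lifetime.
 *
 *	static void fs_fill_symlink(struct inode *inode, char *target)
 *	{
 *		inode->i_op = &fs_symlink_inode_operations;
 *		inode_set_cached_link(inode, target, strlen(target));
 *	}
 */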
/*
 * inode->i_mutex nesting subclasses for the lock validator:
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 */
enum inode_i_mutex_lock_class
{
	I_MUTEX_NORMAL,
	I_MUTEX_PARENT,
	I_MUTEX_CHILD,
	I_MUTEX_XATTR,
	I_MUTEX_NONDIR2,
	I_MUTEX_PARENT2,
};

static inline void inode_lock(struct inode *inode)
{
	down_write(&inode->i_rwsem);
}

static inline void inode_unlock(struct inode *inode)
{
	up_write(&inode->i_rwsem);
}

static inline void inode_lock_shared(struct inode *inode)
{
	down_read(&inode->i_rwsem);
}

static inline void inode_unlock_shared(struct inode *inode)
{
	up_read(&inode->i_rwsem);
}

static inline int inode_trylock(struct inode *inode)
{
	return down_write_trylock(&inode->i_rwsem);
}

static inline int inode_trylock_shared(struct inode *inode)
{
	return down_read_trylock(&inode->i_rwsem);
}

static inline int inode_is_locked(struct inode *inode)
{
	return rwsem_is_locked(&inode->i_rwsem);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	down_write_nested(&inode->i_rwsem, subclass);
}

static inline void inode_lock_shared_nested(struct inode *inode,
					    unsigned subclass)
{
	down_read_nested(&inode->i_rwsem, subclass);
}

static inline void filemap_invalidate_lock(struct address_space *mapping)
{
	down_write(&mapping->invalidate_lock);
}

static inline void filemap_invalidate_unlock(struct address_space *mapping)
{
	up_write(&mapping->invalidate_lock);
}

static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
{
	down_read(&mapping->invalidate_lock);
}

static inline int filemap_invalidate_trylock_shared(
					struct address_space *mapping)
{
	return down_read_trylock(&mapping->invalidate_lock);
}

static inline void filemap_invalidate_unlock_shared(
					struct address_space *mapping)
{
	up_read(&mapping->invalidate_lock);
}

void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2);
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2);

/*
 * NOTE: on a 32bit arch with a preemptible kernel and a UP compile,
 * i_size_read/write must be atomic with respect to the local cpu (unlike
 * with preempt disabled), but they don't need to be atomic with respect to
 * other cpus as in true SMP (so they either need to locally disable irqs
 * around the read, or, for example on x86, they can still be implemented as
 * a cmpxchg8b without needing the lock prefix).  For SMP compiles and 64bit
 * archs it makes no difference whether preempt is enabled or not.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
	loff_t i_size;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&inode->i_size_seqcount);
		i_size = inode->i_size;
	} while (read_seqcount_retry(&inode->i_size_seqcount, seq));
	return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
	loff_t i_size;

	preempt_disable();
	i_size = inode->i_size;
	preempt_enable();
	return i_size;
#else
	/* Pairs with smp_store_release() in i_size_write() */
	return smp_load_acquire(&inode->i_size);
#endif
}

/*
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
	preempt_disable();
	write_seqcount_begin(&inode->i_size_seqcount);
	inode->i_size = i_size;
	write_seqcount_end(&inode->i_size_seqcount);
	preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
	preempt_disable();
	inode->i_size = i_size;
	preempt_enable();
#else
	/*
	 * Pairs with smp_load_acquire() in i_size_read() to ensure
	 * changes related to inode size (such as page contents) are
	 * visible before we see the changed inode size.
	 */
	smp_store_release(&inode->i_size, i_size);
#endif
}
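/*
 * Illustrative sketch (not part of this header): extending a file from a
 * hypothetical write path.  Per the NOTE above, i_size_write() is only safe
 * under serialization -- typically inode_lock() -- while i_size_read()
 * needs no lock at all.
 *
 *	inode_lock(inode);
 *	if (new_end > i_size_read(inode))
 *		i_size_write(inode, new_end);
 *	inode_unlock(inode);
 */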
static inline unsigned iminor(const struct inode *inode)
{
	return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
	return MAJOR(inode->i_rdev);
}

struct fown_struct {
	struct file *file;	/* backpointer for security modules */
	rwlock_t lock;		/* protects pid, uid, euid fields */
	struct pid *pid;	/* pid or -pgrp where SIGIO should be sent */
	enum pid_type pid_type;	/* Kind of process group SIGIO should be sent to */
	kuid_t uid, euid;	/* uid/euid of process setting the owner */
	int signum;		/* posix.1b rt signal to be delivered on IO */
};

/**
 * struct file_ra_state - Track a file's readahead state.
 * @start: Where the most recent readahead started.
 * @size: Number of pages read in the most recent readahead.
 * @async_size: Number of pages that were/are not needed immediately
 *      and so were/are genuinely "ahead".  Start next readahead when
 *      the first of these pages is accessed.
 * @ra_pages: Maximum size of a readahead request, copied from the bdi.
 * @mmap_miss: How many mmap accesses missed in the page cache.
 * @prev_pos: The last byte in the most recent read request.
 *
 * When this structure is passed to ->readahead(), the "most recent"
 * readahead means the current readahead.
 */
struct file_ra_state {
	pgoff_t start;
	unsigned int size;
	unsigned int async_size;
	unsigned int ra_pages;
	unsigned int mmap_miss;
	loff_t prev_pos;
};

/*
 * Check if @index falls in the readahead windows.
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
	return (index >= ra->start &&
		index <  ra->start + ra->size);
}

/**
 * struct file - Represents a file
 * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
 * @f_mode: FMODE_* flags often used in hotpaths
 * @f_op: file operations
 * @f_mapping: Contents of a cacheable, mappable object.
 * @private_data: filesystem or driver specific data
 * @f_inode: cached inode
 * @f_flags: file flags
 * @f_iocb_flags: iocb flags
 * @f_cred: stashed credentials of creator/opener
 * @f_owner: file owner
 * @f_path: path of the file
 * @f_pos_lock: lock protecting file position
 * @f_pipe: specific to pipes
 * @f_pos: file position
 * @f_security: LSM security context of this file
 * @f_wb_err: writeback error
 * @f_sb_err: per sb writeback errors
 * @f_ep: link of all epoll hooks for this file
 * @f_task_work: task work entry point
 * @f_llist: work queue entrypoint
 * @f_ra: file's readahead state
 * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
 * @f_ref: reference count
 */
struct file {
	spinlock_t		f_lock;
	fmode_t			f_mode;
	const struct file_operations	*f_op;
	struct address_space	*f_mapping;
	void			*private_data;
	struct inode		*f_inode;
	unsigned int		f_flags;
	unsigned int		f_iocb_flags;
	const struct cred	*f_cred;
	struct fown_struct	*f_owner;
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct path		f_path;
	union {
		/* regular files (with FMODE_ATOMIC_POS) and directories */
		struct mutex	f_pos_lock;
		/* pipes */
		u64		f_pipe;
	};
	loff_t			f_pos;
#ifdef CONFIG_SECURITY
	void			*f_security;
#endif
	/* --- cacheline 2 boundary (128 bytes) --- */
	errseq_t		f_wb_err;
	errseq_t		f_sb_err;
#ifdef CONFIG_EPOLL
	struct hlist_head	*f_ep;
#endif
	union {
		struct callback_head	f_task_work;
		struct llist_node	f_llist;
		struct file_ra_state	f_ra;
		freeptr_t		f_freeptr;
	};
	file_ref_t		f_ref;
	/* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
  __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */

struct file_handle {
	__u32 handle_bytes;
	int handle_type;
	/* file identifier */
	unsigned char f_handle[] __counted_by(handle_bytes);
};

static inline struct file *get_file(struct file *f)
{
	file_ref_inc(&f->f_ref);
	return f;
}

struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);

#define file_count(f)	file_ref_read(&(f)->f_ref)

#define	MAX_NON_LFS	((1UL<<31) - 1)

/* Page cache limit. The filesystems should put that into their s_maxbytes
   limits, otherwise bad things can happen in VM. */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE	((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE	((loff_t)LLONG_MAX)
#endif

/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;

struct file_lock;
struct file_lease;

/* The following constant reflects the upper bound of the file/locking space */
#ifndef OFFSET_MAX
#define OFFSET_MAX	type_max(loff_t)
#define OFFT_OFFSET_MAX	type_max(off_t)
#endif

int file_f_owner_allocate(struct file *file);
static inline struct fown_struct *file_f_owner(const struct file *file)
{
	return READ_ONCE(file->f_owner);
}

extern void send_sigio(struct fown_struct *fown, int fd, int band);

static inline struct inode *file_inode(const struct file *f)
{
	return f->f_inode;
}

/*
 * file_dentry() is a relic from the days that overlayfs was using files with a
 * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs.
 * In those days, file_dentry() was needed to get the underlying fs dentry that
 * matches f_inode.
 * Files with "fake" path should not exist nowadays, so use an assertion to make
 * sure that file_dentry() was not papering over filesystem bugs.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;

	WARN_ON_ONCE(d_inode(dentry) != file_inode(file));
	return dentry;
}
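/*
 * Illustrative sketch (not part of this header): taking a temporary
 * reference on a struct file and reading its inode.  "log_open_file" is
 * hypothetical; get_file() must be balanced by fput(), and file_inode()
 * is the sanctioned way to reach the inode from a file.
 *
 *	static void log_open_file(struct file *f)
 *	{
 *		struct inode *inode = file_inode(f);
 *
 *		get_file(f);	// pin f across any sleeping work
 *		pr_info("ino %lu, f_mode %x\n", inode->i_ino,
 *			(unsigned int)f->f_mode);
 *		fput(f);
 *	}
 */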
struct fasync_struct {
	rwlock_t		fa_lock;
	int			magic;
	int			fa_fd;
	struct fasync_struct	*fa_next; /* singly linked list */
	struct file		*fa_file;
	struct rcu_head		fa_rcu;
};

#define FASYNC_MAGIC 0x4601

/* SMP safe fasync helpers: */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct file *file);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 */
#define SB_RDONLY	BIT(0)	/* Mount read-only */
#define SB_NOSUID	BIT(1)	/* Ignore suid and sgid bits */
#define SB_NODEV	BIT(2)	/* Disallow access to device special files */
#define SB_NOEXEC	BIT(3)	/* Disallow program execution */
#define SB_SYNCHRONOUS	BIT(4)	/* Writes are synced at once */
#define SB_MANDLOCK	BIT(6)	/* Allow mandatory locks on an FS */
#define SB_DIRSYNC	BIT(7)	/* Directory modifications are synchronous */
#define SB_NOATIME	BIT(10)	/* Do not update access times. */
#define SB_NODIRATIME	BIT(11)	/* Do not update directory access times */
#define SB_SILENT	BIT(15)
#define SB_POSIXACL	BIT(16)	/* Supports POSIX ACLs */
#define SB_INLINECRYPT	BIT(17)	/* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT	BIT(22)	/* this is a kern_mount call */
#define SB_I_VERSION	BIT(23)	/* Update inode I_version field */
#define SB_LAZYTIME	BIT(25)	/* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel */
#define SB_DEAD		BIT(21)
#define SB_DYING	BIT(24)
#define SB_SUBMOUNT	BIT(26)
#define SB_FORCE	BIT(27)
#define SB_NOSEC	BIT(28)
#define SB_BORN		BIT(29)
#define SB_ACTIVE	BIT(30)
#define SB_NOUSER	BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL	(1 << 0)
#define SB_ENC_NO_COMPAT_FALLBACK_FL	(1 << 1)

#define sb_has_strict_encoding(sb) \
	(sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

#if IS_ENABLED(CONFIG_UNICODE)
#define sb_no_casefold_compat_fallback(sb) \
	(sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
#else
#define sb_no_casefold_compat_fallback(sb) (1)
#endif

/*
 * Umount options
 */

#define MNT_FORCE	0x00000001	/* Attempt to forcibly umount */
#define MNT_DETACH	0x00000002	/* Just detach from the tree */
#define MNT_EXPIRE	0x00000004	/* Mark for expiry */
#define UMOUNT_NOFOLLOW	0x00000008	/* Don't follow symlink on umount */
#define UMOUNT_UNUSED	0x80000000	/* Flag guaranteed to be unused */

/* sb->s_iflags */
#define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
#define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
#define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008	/* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { unsigned short frozen; /* Is sb frozen? */ int freeze_kcount; /* How many kernel freeze requests? */ int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler * const *s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */ struct sb_writers s_writers; /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ /* Granularity of c/m/atime in ns (cannot be worse than a second) */ u32 s_time_gran; /* Time limits for c/m/atime in seconds */ time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif /* * q: why are s_id and s_sysfs_name not the same? 
both are human * readable strings that identify the filesystem * a: s_id is allowed to change at runtime; it's used in log messages, * and we want that: when a device starts out as a single device (s_id is * the dev name) but another device is then hot-added, we have to switch * to identifying the filesystem by UUID * but s_sysfs_name is a handle for programmatic access, and can't * change at runtime */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ char s_sysfs_name[UUID_STRING_LEN + 1]; unsigned int s_max_links; /* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */ /* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; /* Read-only state of the superblock is being changed */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ errseq_t s_wb_err; /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; /* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels. */ struct user_namespace *s_user_ns; /* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines. */ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; } /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
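 *
 * An illustrative (sketch) caller-side check; the -EOVERFLOW policy is an
 * assumption of the example, not something this header mandates:
 *
 *	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
 *
 *	if (!vfsuid_valid(vfsuid))
 *		return -EOVERFLOW;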
*/ static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the @inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not. */ static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. */ static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the @inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not. */ static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with caller's fsuid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with caller's fsgid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode.
If the inode was found/created via * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid are mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && kgid_has_mapping(fs_userns, kgid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), .tv_nsec = inode_get_atime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_atime_sec = ts.tv_sec; inode->i_atime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_atime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), .tv_nsec = inode_get_mtime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_mtime_sec = ts.tv_sec; inode->i_mtime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_mtime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_mtime_to_ts(inode, ts); } /* * Multigrain timestamps * * Conditionally use fine-grained ctime and mtime timestamps when there * are users actively observing them via getattr. The primary use-case * for this is NFS clients that use the ctime to distinguish between * different states of the file, and that are often fooled by multiple * operations that occur in the same coarse-grained timer tick.
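 *
 * A minimal sketch of how the queried flag is consumed: the flag occupies
 * the top bit of i_ctime_nsec and readers mask it off, so it never leaks
 * into timespec64 values (see inode_get_ctime_nsec() below):
 *
 *	ts.tv_nsec = inode->i_ctime_nsec & ~I_CTIME_QUERIED;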
*/ #define I_CTIME_QUERIED ((u32)BIT(31)) static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), .tv_nsec = inode_get_ctime_nsec(inode) }; return ts; } struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } struct timespec64 simple_inode_init_ts(struct inode *inode); /* * Snapshotting support. */ /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level-1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read(sb->s_writers.rw_sem + level - 1); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } /** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } /** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_not_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; } /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_started(file_inode(file)->i_sb); } /** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. 
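 *
 * An illustrative (sketch) assertion for a path that must already hold
 * freeze protection on a regular file:
 *
 *	WARN_ON_ONCE(file_write_not_started(file));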
*/ static inline bool file_write_not_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_not_started(file_inode(file)->i_sb); } /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_mutex (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. 
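 *
 * A minimal pairing sketch (my_fs_do_internal_update() is a hypothetical
 * fs-internal helper):
 *
 *	sb_start_intwrite(sb);
 *	my_fs_do_internal_update(sb);
 *	sb_end_intwrite(sb);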
* * For example, a filesystem can call sb_start_intwrite() when starting a * transaction, which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions. */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); /** * struct renamedata - contains all information required for renaming * @old_mnt_idmap: idmap of the old mount the inode was found from * @old_dir: parent of source * @old_dentry: source * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *old_mnt_idmap; struct inode *old_dir; struct dentry *old_dentry; struct mnt_idmap *new_mnt_idmap; struct inode *new_dir; struct dentry *new_dentry; struct inode **delegated_inode; unsigned int flags; } __randomize_layout; int vfs_rename(struct renamedata *); static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. * Return 'true' to keep going and 'false' if there are no more entries.
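 *
 * An illustrative (sketch) actor matching the filldir_t signature declared
 * below; returning true keeps the iteration going:
 *
 *	static bool my_filldir(struct dir_context *ctx, const char *name,
 *			       int namlen, loff_t offset, u64 ino,
 *			       unsigned int d_type)
 *	{
 *		return true;
 *	}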
*/ struct dir_context; typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; }; /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct iov_iter; struct io_uring_cmd; struct offset_ctx; typedef unsigned int __bitwise fop_flags_t; struct file_operations { struct module *owner; fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, 
int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); } __randomize_layout; /* Supports async buffered reads */ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) /* Supports async buffered writes */ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) /* Supports synchronous page faults for mappings */ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Treat loff_t as unsigned (e.g., /dev/mem) */ #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ #define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); #define WRAP_DIR_ITER(x) \ static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int 
(*fileattr_get)(struct dentry *dentry, struct fileattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { return file->f_op->mmap(file, vma); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); /** * enum freeze_holder - holder of the freeze * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed * * Indicate who the owner of the freeze or thaw request is and whether * the freeze needs to be exclusive or can nest. * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the * same holder aren't allowed. It is however allowed to hold a single * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at * the same time. This is relied upon by some filesystems during online * repair or similar. 
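 *
 * An illustrative (sketch) kernel-initiated freeze/thaw cycle using the
 * freeze_super()/thaw_super() interfaces declared later in this header:
 *
 *	ret = freeze_super(sb, FREEZE_HOLDER_KERNEL | FREEZE_MAY_NEST);
 *	if (!ret)
 *		ret = thaw_super(sb, FREEZE_HOLDER_KERNEL | FREEZE_MAY_NEST);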
*/ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *, enum freeze_holder who); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *, enum freeze_holder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); void (*shutdown)(struct super_block *sb); }; /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (e.g. fs/cachefiles) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystem's flags while it is * mounted and has files in use. This means that all of the inodes will not * have their i_flags updated. Hence, i_flags no longer inherit the superblock * mount * flags, so these have to be checked separately.
-- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #else #define IS_POSIXACL(inode) 0 #endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #else #define IS_SWAPFILE(inode) ((void)(inode), 0U) #endif #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } /* * Inode state bits. Protected by inode->i_lock * * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync() (unless I_DIRTY_DATASYNC is also set). * Timestamp updates are the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. 
But * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already * in place because writeback might already be in progress * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently referenced on the LRU list. * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * I_PINNING_NETFS_WB Inode is pinning a netfs object for writeback. * * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait * upon. There's one free address left.
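 *
 * An illustrative (sketch) test of the dirty bits via the
 * inode_is_dirtytime_only() helper defined below; i_state is protected by
 * i_lock as noted at the top of this comment:
 *
 *	spin_lock(&inode->i_lock);
 *	dirty_time_only = inode_is_dirtytime_only(inode);
 *	spin_unlock(&inode->i_lock);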
*/ #define __I_NEW 0 #define I_NEW (1 << __I_NEW) #define __I_SYNC 1 #define I_SYNC (1 << __I_SYNC) #define __I_LRU_ISOLATING 2 #define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_SYNC (1 << 3) #define I_DIRTY_DATASYNC (1 << 4) #define I_DIRTY_PAGES (1 << 5) #define I_WILL_FREE (1 << 6) #define I_FREEING (1 << 7) #define I_CLEAR (1 << 8) #define I_REFERENCED (1 << 9) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 12) #define I_OVL_INUSE (1 << 13) #define I_CREATING (1 << 14) #define I_DONTCACHE (1 << 15) #define I_SYNC_QUEUED (1 << 16) #define I_PINNING_NETFS_WB (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. * Filesystems should call this if, when writing an inode while lazytime is * enabled, they want to opportunistically write the timestamps of other inodes * located very nearby on-disk, e.g. in the same inode block. This returns true * if the given inode is in need of such an opportunistic update. Requires * i_lock, or at least later re-checking under i_lock. */ static inline bool inode_is_dirtytime_only(struct inode *inode) { return (inode->i_state & (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } enum file_time_flags { S_ATIME = 1, S_MTIME = 2, S_CTIME = 4, S_VERSION = 8, }; extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); int inode_update_time(struct inode *inode, int flags); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally.
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) /** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise. */ static inline bool is_mgtime(const struct inode *inode) { return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) ({ \ const struct file_operations *_fops = (fops); \ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ }) #define fops_put(fops) ({ \ const struct file_operations *_fops = (fops); \ if (_fops) \ module_put((_fops)->owner); \ }) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. 
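 *
 * An illustrative (sketch) ->open() instance; mydev_open() and
 * mydev_real_fops are hypothetical, with mydev_real_fops statically
 * pinned as required above:
 *
 *	static int mydev_open(struct inode *inode, struct file *filp)
 *	{
 *		replace_fops(filp, &mydev_real_fops);
 *		if (filp->f_op->open)
 *			return filp->f_op->open(inode, filp);
 *		return 0;
 *	}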
*/ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); int freeze_super(struct super_block *super, enum freeze_holder who); int thaw_super(struct super_block *super, enum freeze_holder who); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) { if (WARN_ON(len > sizeof(sb->s_uuid))) len = sizeof(sb->s_uuid); sb->s_uuid_len = len; memcpy(&sb->s_uuid, uuid, len); } /* set sb sysfs name based on sb->s_bdev */ static inline void super_set_sysfs_name_bdev(struct super_block *sb) { snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); } /* set sb sysfs name based on sb->s_uuid */ static inline void super_set_sysfs_name_uuid(struct super_block *sb) { WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); } /* set sb sysfs name based on sb->s_id */ static inline void super_set_sysfs_name_id(struct super_block *sb) { strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); } /* try to use something standard before you use this */ __printf(2, 3) static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); va_end(args); } extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) /* fs/open.c */ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ atomic_t refcnt; struct audit_names *aname; const char iname[]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { return mnt_idmap(file->f_path.mnt); } /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has a non-@nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not.
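 *
 * An illustrative (sketch) guard for code that cannot handle idmapped
 * mounts; the -EOPNOTSUPP policy is an assumption of the example:
 *
 *	if (is_idmapped_mnt(path->mnt))
 *		return -EOPNOTSUPP;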
*/ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { return mnt_idmap(mnt) != &nop_mnt_idmap; } int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, const char *name, int flags, umode_t mode) { return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd. */ /* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } /* Get the inode whose inode number to display in /proc/<pid>/maps */ static inline const struct inode *file_user_inode(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f); } static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); static inline struct filename *getname(const char __user *name) { return getname_flags(name, 0); } extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) { if (!(flags & AT_EMPTY_PATH)) return getname(name); if (!name) return NULL; return __getname_maybe_null(name); } extern void putname(struct filename *name); DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) static inline struct filename *refname(struct filename *name) { atomic_inc(&name->refcnt); return name; } extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* Helper for the simple case when original dentry is used */ static inline int finish_open_simple(struct file *file, int error) { if (error) return error; return finish_open(file, file->f_path.dentry, NULL); } /* fs/dcache.c */ extern void __init 
vfs_caches_init_early(void); extern void __init vfs_caches_init(void); extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) extern struct super_block *blockdev_superblock; static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); static inline bool iocb_is_dsync(const struct kiocb *iocb) { return (iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host); } /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 
0 : 1); if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, iocb->ki_pos - 1); } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct inode **); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } /** * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * * This is a variant of sb_start_write() which is a noop on non-regular files. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } /** * file_end_write - drop write access to a superblock of a regular file * @file: the file we wrote to * * Should be matched with a call to file_start_write(). */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_end_write(file_inode(file)->i_sb); } /** * kiocb_start_write - get write access to a superblock for async file io * @iocb: the io context we want to submit the write with * * This is a variant of sb_start_write() for async io submission. * Should be matched with a call to kiocb_end_write(). */ static inline void kiocb_start_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); sb_start_write(inode->i_sb); /* * Fool lockdep by telling it the lock got released so that it * doesn't complain about the held lock when we return to userspace. */ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); } /** * kiocb_end_write - drop write access to a superblock after async file io * @iocb: the io context we submitted the write with * * Should be matched with a call to kiocb_start_write(). */ static inline void kiocb_end_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission thread. */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); sb_end_write(inode->i_sb); } /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap * read-write shared) accesses. * * get_write_access() gets write permission for a file.
* put_write_access() releases this write permission. * deny_write_access() denies write access to a file. * allow_write_access() re-enables write access to a file. * * The i_writecount field of an inode can have the following values: * 0: no write access, no denied write access * < 0: (-i_writecount) users that denied write access to the file. * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } /* * Do not prevent write to executable file when watched by pre-content events. * * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at * the time of file open and remains constant for entire lifetime of the file, * so if pre-content watches are added post execution or removed before the end * of the execution, it will not cause i_writecount reference leak. */ static inline int exe_file_deny_write_access(struct file *exe_file) { if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return 0; return deny_write_access(exe_file); } static inline void exe_file_allow_write_access(struct file *exe_file) { if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return; allow_write_access(exe_file); } static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) { file->f_mode &= ~FMODE_FSNOTIFY_MASK; file->f_mode |= mode; } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); /** * is_dot_dotdot - returns true only if @name is "." or ".." 
* @name: file name to check * @len: length of file name, in bytes */ static inline bool is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); } #include <linux/err.h> /* needed for stackable file system support */ extern loff_t default_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); struct inode *iget5_locked_rcu(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, int (*match)(struct inode *, unsigned long, void *), void *data); extern struct inode *find_inode_rcu(struct super_block *, unsigned long, int (*)(struct inode *, void *), void *); extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. 
*/ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } /* * inode->i_lock must be held */ static inline void __iget(struct inode *inode) { atomic_inc(&inode->i_count); } extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); struct inode *alloc_inode(struct super_block *sb); static inline struct inode *new_inode_pseudo(struct super_block *sb) { return alloc_inode(sb); } extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, 
loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, loff_t file_offset); enum { /* need locking between buffered and direct access */ DIO_LOCKING = 0x01, /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, }; ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O request * @inode: inode the direct I/O happens on * * This is called before we start processing a direct I/O request, and * accounts the request so that callers of inode_dio_wait() block until * all outstanding direct I/O against the inode has completed. */ static inline void inode_dio_begin(struct inode *inode) { atomic_inc(&inode->i_dio_count); } /** * inode_dio_end - signal finish of a direct I/O request * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced.
*/ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link_raw(struct dentry *, struct inode *, struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct 
kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { struct maple_tree mt; unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); void simple_offset_destroy(struct offset_ctx *octx); extern const struct file_operations simple_offset_dir_operations; extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); extern int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(const struct dentry *dentry, 
struct qstr *str); int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This function checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that don't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; /* * A casefold dir must have an encoding set, unless the filesystem * is corrupted */ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) return true; return !utf8_validate(dir->i_sb->s_encoding, name); } #else static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { return true; } #endif static inline bool sb_has_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) return !!sb->s_encoding; #else return false; #endif } int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int rw_type) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) return -EOPNOTSUPP; if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } if (flags & RWF_DONTCACHE) { /* file system must support it */ if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) return -EOPNOTSUPP; /* DAX mappings not supported */ if
(IS_DAX(ki->ki_filp->f_mapping->host)) return -EOPNOTSUPP; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { if (IS_APPEND(file_inode(ki->ki_filp))) return -EPERM; ki->ki_flags &= ~IOCB_APPEND; } ki->ki_flags |= kiocb_flags; return 0; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similarly to those in sysfs: * * Writing to an attribute immediately sets a value; an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that is accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...)
{ /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); static inline bool vfs_empty_path(int dfd, const char __user *path) { char c; if (dfd < 0) return false; /* We now allow NULL to be used for empty path. */ if (!path) return true; if (unlikely(get_user(c, path))) return false; return !c; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */
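The "simple attribute" helpers declared above are most often consumed through DEFINE_SIMPLE_ATTRIBUTE() together with debugfs. The following is a minimal sketch of that pattern; the debugfs consumer and the names example_counter, example_counter_fops and example_counter_expose are assumptions made purely for illustration, not part of fs.h.

#include <linux/debugfs.h>
#include <linux/fs.h>

static u64 example_counter;	/* hypothetical value exposed to userspace */

static int example_counter_get(void *data, u64 *val)
{
	*val = *(u64 *)data;	/* data is what was stored in inode->i_private */
	return 0;
}

static int example_counter_set(void *data, u64 val)
{
	*(u64 *)data = val;
	return 0;
}

/* Generates example_counter_fops_open() and the file_operations table. */
DEFINE_SIMPLE_ATTRIBUTE(example_counter_fops, example_counter_get,
			example_counter_set, "%llu\n");

static void example_counter_expose(struct dentry *parent)
{
	/* &example_counter lands in i_private and is handed to get()/set(). */
	debugfs_create_file("example_counter", 0644, parent,
			    &example_counter, &example_counter_fops);
}

For files created specifically in debugfs, the DEFINE_DEBUGFS_ATTRIBUTE() variant is the usually recommended form; DEFINE_SIMPLE_ATTRIBUTE() is the generic helper shown here.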
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H #ifdef CONFIG_HUGETLB_PAGE #include <linux/mm.h> static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return false; } #endif #endif
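A small, hedged sketch of how the is_vm_hugetlb_page() predicate above is typically used: hugetlb VMAs are mapped with their own (larger) page size, so a walker should not assume PAGE_SIZE for them. The helper name example_walk_step is hypothetical; vma_kernel_pagesize() comes from <linux/hugetlb.h>.

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_inline.h>

/* Hypothetical helper: pick how far to advance per iteration when walking
 * a user mapping. Hugetlb VMAs report their huge page size via
 * vma_kernel_pagesize(); everything else uses PAGE_SIZE. */
static unsigned long example_walk_step(struct vm_area_struct *vma)
{
	if (is_vm_hugetlb_page(vma))
		return vma_kernel_pagesize(vma);
	return PAGE_SIZE;
}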
// SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", current->comm); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely.
* * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", current->comm); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. 
*/ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. 
Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. 
*/ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
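To tie the capability-checking helpers above together, here is a hedged sketch of a driver-style privilege gate: the opener's credentials (file->f_cred) are consulted via file_ns_capable(), with a fallback to the current task's effective capabilities via capable(). The function name example_priv_ioctl and the choice of CAP_NET_ADMIN are illustrative assumptions only.

#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/fs.h>
#include <linux/errno.h>

/* Hypothetical ioctl handler that requires CAP_NET_ADMIN, granted either to
 * whoever opened the file or to the task issuing the call. */
static long example_priv_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
{
	if (!file_ns_capable(file, &init_user_ns, CAP_NET_ADMIN) &&
	    !capable(CAP_NET_ADMIN))
		return -EPERM;

	/* ... perform the privileged operation for cmd/arg ... */
	return 0;
}

Note that capable() sets PF_SUPERPRIV on success (as documented above), while file_ns_capable() deliberately does not, since the caller may not itself be privileged.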
/* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #include "ipoib.h" #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> /* For ARPHRD_xxx */ #include <linux/ip.h> #include <linux/in.h> #include <linux/jhash.h> #include <net/arp.h> #include <net/addrconf.h> #include <net/pkt_sched.h> #include <linux/inetdevice.h> #include <rdma/ib_cache.h> MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct net_device *dev; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static int ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static int ipoib_set_mac(struct net_device *dev, void *addr); static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int ipoib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *ni = ptr; struct net_device *dev = ni->dev; if (dev->netdev_ops->ndo_open != ipoib_open) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: ipoib_create_debug_files(dev); break; case NETDEV_CHANGENAME: ipoib_delete_debug_files(dev); ipoib_create_debug_files(dev); break; case NETDEV_UNREGISTER: ipoib_delete_debug_files(dev); break; } return NOTIFY_DONE; } #endif int ipoib_open(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "bringing up interface\n"); netif_carrier_off(dev); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; } ipoib_ib_dev_up(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; flags = cpriv->dev->flags; if (flags & IFF_UP) continue; dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } else if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags)) ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n", ppriv->dev->name); } 
netif_start_queue(dev); return 0; err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static int ipoib_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); netif_stop_queue(dev); ipoib_ib_dev_down(dev); ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; flags = cpriv->dev->flags; if (!(flags & IFF_UP)) continue; dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } return 0; } static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); return features; } static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = 0; /* dev->mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(dev)) { if (new_mtu > ipoib_cm_max_mtu(dev)) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); WRITE_ONCE(dev->mtu, new_mtu); return 0; } if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; if (priv->mcast_mtu < priv->admin_mtu) ipoib_dbg(priv, "MTU must be smaller than the underlying " "link layer MTU - 4 (%u)\n", priv->mcast_mtu); new_mtu = min(priv->mcast_mtu, priv->admin_mtu); if (priv->rn_ops->ndo_change_mtu) { bool carrier_status = netif_carrier_ok(dev); netif_carrier_off(dev); /* notify lower level on the real mtu */ ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); if (carrier_status) netif_carrier_on(dev); } else { WRITE_ONCE(dev->mtu, new_mtu); } return ret; } static void ipoib_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (priv->rn_ops->ndo_get_stats64) priv->rn_ops->ndo_get_stats64(dev, stats); else netdev_stats_to_stats64(stats, &dev->stats); } /* Called with an RCU read lock taken */ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, struct net_device *dev) { struct net *net = dev_net(dev); struct in_device *in_dev; struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; __be32 ret_addr; switch (addr->sa_family) { case AF_INET: in_dev = in_dev_get(dev); if (!in_dev) return false; ret_addr = inet_confirm_addr(net, in_dev, 0, addr_in->sin_addr.s_addr, RT_SCOPE_HOST); in_dev_put(in_dev); if (ret_addr) return true; break; case AF_INET6: if (IS_ENABLED(CONFIG_IPV6) && ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) return true; break; } return false; } /* * Find the master net_device on top of the given net_device. * @dev: base IPoIB net_device * * Returns the master net_device with a reference held, or the same net_device * if no master exists. 
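* In both cases a reference is taken via dev_hold(), so the caller is * responsible for the matching dev_put().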
*/ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) { struct net_device *master; rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); dev_hold(master); rcu_read_unlock(); if (master) return master; dev_hold(dev); return dev; } struct ipoib_walk_data { const struct sockaddr *addr; struct net_device *result; }; static int ipoib_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; int ret = 0; if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { dev_hold(upper); data->result = upper; ret = 1; } return ret; } /** * ipoib_get_net_dev_match_addr - Find a net_device matching * the given address, which is an upper device of the given net_device. * * @addr: IP address to look for. * @dev: base IPoIB net_device * * If found, returns the net_device with a reference held. Otherwise return * NULL. */ static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { struct netdev_nested_priv priv; struct ipoib_walk_data data = { .addr = addr, }; priv.data = (void *)&data; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); data.result = dev; goto out; } netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); out: rcu_read_unlock(); return data.result; } /* returns the number of IPoIB netdevs on top of a given ipoib device matching a * pkey_index and address, if one exists. * * @found_net_dev: contains a matching net_device if the return value >= 1, * with a reference held. */ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr, int nesting, struct net_device **found_net_dev) { struct ipoib_dev_priv *child_priv; struct net_device *net_dev = NULL; int matches = 0; if (priv->pkey_index == pkey_index && (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (!addr) { net_dev = ipoib_get_master_net_dev(priv->dev); } else { /* Verify the net_device matches the IP address, as * IPoIB child devices currently share a GID. */ net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); } if (net_dev) { if (!*found_net_dev) *found_net_dev = net_dev; else dev_put(net_dev); ++matches; } } /* Check child interfaces */ down_read_nested(&priv->vlan_rwsem, nesting); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, pkey_index, addr, nesting + 1, found_net_dev); if (matches > 1) break; } up_read(&priv->vlan_rwsem); return matches; } /* Returns the number of matching net_devs found (between 0 and 2).
Also * return the matching net_device in the @net_dev parameter, holding a * reference to the net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, 0, net_dev); if (matches > 1) break; } return matches; } static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); fallthrough; case 1: return net_dev; } } int ipoib_set_mode(struct net_device *dev, const char *buf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "connected\n")) || (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "datagram\n"))) { return 0; } /* flush paths if we switch modes so that connections are restarted */ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); netdev_update_features(dev); dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); netif_set_real_num_tx_queues(dev, 1); rtnl_unlock(); priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; } if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); netdev_update_features(dev); dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); netif_set_real_num_tx_queues(dev, dev->num_tx_queues); rtnl_unlock(); ipoib_flush_paths(dev); return (!rtnl_trylock()) ? 
-EBUSY : 0; } return -EINVAL; } struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } static void path_free(struct net_device *dev, struct ipoib_path *path) { struct sk_buff *skb; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); ipoib_dbg(ipoib_priv(dev), "%s\n", __func__); /* remove all neigh connected to this path */ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; iter->dev = dev; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec)), path->pathrec.dgid.raw); if (path->ah) path->ah->valid = 0; } spin_unlock_irq(&priv->lock); } static void push_pseudo_header(struct sk_buff *skb, const char *daddr) { struct ipoib_pseudo_header *phdr; phdr = skb_push(skb, sizeof(*phdr)); memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); } void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, 
flags); netif_tx_unlock_bh(dev); wait_for_completion(&path->done); path_free(dev, path); netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); } static void path_rec_completion(int status, struct sa_path_rec *pathrec, unsigned int num_prs, void *path_ptr) { struct ipoib_path *path = path_ptr; struct net_device *dev = path->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ipoib_neigh *neigh, *tn; struct sk_buff_head skqueue; struct sk_buff *skb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", be32_to_cpu(sa_path_get_dlid(pathrec)), pathrec->dgid.raw); else ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", status, path->pathrec.dgid.raw); skb_queue_head_init(&skqueue); if (!status) { struct rdma_ah_attr av; if (!ib_init_ah_attr_from_path(priv->ca, priv->port, pathrec, &av, NULL)) { ah = ipoib_create_ah(dev, priv->pd, &av); rdma_destroy_ah_attr(&av); } } spin_lock_irqsave(&priv->lock, flags); if (!IS_ERR_OR_NULL(ah)) { /* * pathrec.dgid is used as the database key from the LLADDR, * it must remain unchanged even if the SA returns a different * GID to use in the AH. */ if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid))) { ipoib_dbg( priv, "%s got PathRec for gid %pI6 while asked for %pI6\n", dev->name, pathrec->dgid.raw, path->pathrec.dgid.raw); memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid)); } path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be32_to_cpu(sa_path_get_dlid(pathrec)), pathrec->sl); while ((skb = __skb_dequeue(&path->queue))) __skb_queue_tail(&skqueue, skb); list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { if (neigh->ah) { WARN_ON(neigh->ah != old_ah); /* * Dropping the ah reference inside * priv->lock is safe here, because we * will hold one more reference from * the original value of path->ah (ie * old_ah). 
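* That extra reference keeps the refcount from reaching zero here, so * the AH cannot be freed while priv->lock is still held.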
*/ ipoib_put_ah(neigh->ah); } kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); continue; } } while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } path->ah->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (IS_ERR_OR_NULL(ah)) ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (old_ah) ipoib_put_ah(old_ah); while ((skb = __skb_dequeue(&skqueue))) { int ret; skb->dev = dev; ret = dev_queue_xmit(skb); if (ret) ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", __func__, ret); } } static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, void *gid) { path->dev = priv->dev; if (rdma_cap_opa_ah(priv->ca, priv->port)) path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; else path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; } static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof(*path), GFP_ATOMIC); if (!path) return NULL; skb_queue_head_init(&path->queue); INIT_LIST_HEAD(&path->neigh_list); init_path_rec(priv, path, gid); return path; } static int path_rec_start(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "Start path record lookup for %pI6\n", path->pathrec.dgid.raw); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &path->pathrec, IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); path = __path_find(dev, daddr + 4); if (!path) goto out; if (!path->query) path_rec_start(dev, path); out: spin_unlock_irqrestore(&priv->lock, flags); } static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NULL; } /* To avoid race condition, make sure that the * neigh will be added only once. 
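* A non-empty neigh->list means another context already linked the * neigh to a path, so it can be returned as-is.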
*/ if (unlikely(!list_empty(&neigh->list))) { spin_unlock_irqrestore(&priv->lock, flags); return neigh; } path = __path_find(dev, daddr + 4); if (!path) { path = path_rec_create(dev, daddr + 4); if (!path) goto err_path; __path_add(dev, path); } list_add_tail(&neigh->list, &path->neigh_list); if (path->ah && path->ah->valid) { kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); goto err_drop; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { ipoib_warn(priv, "queue length limit %d. Packet drop.\n", skb_queue_len(&neigh->queue)); goto err_drop; } } else { spin_unlock_irqrestore(&priv->lock, flags); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(daddr)); ipoib_neigh_put(neigh); return NULL; } } else { neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) goto err_path; if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { goto err_drop; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; err_path: ipoib_neigh_free(neigh); err_drop: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, struct ipoib_pseudo_header *phdr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); /* no broadcast means that all paths are (going to be) not valid */ if (!priv->broadcast) goto drop_and_unlock; path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->ah || !path->ah->valid) { if (!path) { path = path_rec_create(dev, phdr->hwaddr + 4); if (!path) goto drop_and_unlock; __path_add(dev, path); } else { /* * make sure there are no changes in the existing * path record */ init_path_rec(priv, path, phdr->hwaddr + 4); } if (!path->query && path_rec_start(dev, path)) { goto drop_and_unlock; } if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); goto unlock; } else { goto drop_and_unlock; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_dbg(priv, "Send unicast ARP to %08x\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec))); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(phdr->hwaddr)); return; drop_and_unlock: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); unlock: spin_unlock_irqrestore(&priv->lock, flags); } static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_neigh *neigh; struct ipoib_pseudo_header *phdr; struct ipoib_header *header; unsigned long flags; phdr = (struct ipoib_pseudo_header *) skb->data; skb_pull(skb, sizeof(*phdr)); header = (struct ipoib_header *) skb->data; if (unlikely(phdr->hwaddr[4] == 0xff)) { /* multicast, arrange "if" according to probability */ if ((header->proto != htons(ETH_P_IP)) && (header->proto != htons(ETH_P_IPV6)) && (header->proto != htons(ETH_P_ARP)) && (header->proto != htons(ETH_P_RARP)) && (header->proto != htons(ETH_P_TIPC))) { /* 
ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* Add in the P_Key for multicast */ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; phdr->hwaddr[9] = priv->pkey & 0xff; neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (likely(neigh)) goto send_using_neigh; ipoib_mcast_send(dev, phdr->hwaddr, skb); return NETDEV_TX_OK; } /* unicast, arrange "switch" according to probability */ switch (header->proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_TIPC): neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (unlikely(!neigh)) { neigh = neigh_add_path(skb, phdr->hwaddr, dev); if (likely(!neigh)) return NETDEV_TX_OK; } break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): /* for unicast ARP and RARP we should always perform a path find */ unicast_arp_send(skb, dev, phdr); return NETDEV_TX_OK; default: /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } send_using_neigh: /* note we now hold a ref to neigh */ if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) { ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref; } } else if (neigh->ah && neigh->ah->valid) { neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, IPOIB_QPN(phdr->hwaddr)); goto unref; } else if (neigh->ah) { neigh_refresh_path(neigh, phdr->hwaddr, dev); } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); } else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } unref: ipoib_neigh_put(neigh); return NETDEV_TX_OK; } static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); if (rn->tx_timeout) { rn->tx_timeout(dev, txqueue); return; } ipoib_warn(priv, "transmit timeout: latency %d msecs\n", jiffies_to_msecs(jiffies - dev_trans_start(dev))); ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n", netif_queue_stopped(dev), priv->tx_head, priv->tx_tail, priv->global_tx_head, priv->global_tx_tail); schedule_work(&priv->tx_timeout_work); } void ipoib_ib_tx_timeout_work(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, tx_timeout_work); int err; rtnl_lock(); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) goto unlock; ipoib_stop(priv->dev); err = ipoib_open(priv->dev); if (err) { ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n", err); goto unlock; } netif_tx_wake_all_queues(priv->dev); unlock: rtnl_unlock(); } static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ipoib_header *header; header = skb_push(skb, sizeof(*header)); header->proto = htons(type); header->reserved = 0; /* * we don't rely on the dst_entry structure, always stuff the * destination address into the skb hard header so we can figure out where * to send the packet later.
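* The pseudo header is never put on the wire; ipoib_start_xmit() pulls it * back off the skb before handing the packet to the send path.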
*/ push_pseudo_header(skb, daddr); return IPOIB_HARD_LEN; } static void ipoib_set_mcast_list(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); return; } queue_work(priv->wq, &priv->restart_task); } static int ipoib_get_iflink(const struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); /* parent interface */ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) return READ_ONCE(dev->ifindex); /* child/vlan interface */ return READ_ONCE(priv->parent->ifindex); } static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) { /* * Use only the address parts that contribute to spreading. * The subnet prefix is not used as one cannot connect to the * same remote port (GUID) using the same remote QPN via two * different subnets. */ /* qpn octets[1:4) & port GUID octets[12:20) */ u32 *d32 = (u32 *) daddr; u32 hv; hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); return hv & htbl->mask; } struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh = NULL; u32 hash_val; rcu_read_lock_bh(); htbl = rcu_dereference_bh(ntbl->htbl); if (!htbl) goto out_unlock; hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); neigh != NULL; neigh = rcu_dereference_bh(neigh->hnext)) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */ neigh = NULL; goto out_unlock; } if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) neigh->alive = jiffies; goto out_unlock; } } out_unlock: rcu_read_unlock_bh(); return neigh; } static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long neigh_obsolete; unsigned long dt; unsigned long flags; int i; LIST_HEAD(remove_list); spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; /* neigh is obsolete if it was idle for two GC periods */ dt = 2 * arp_tbl.gc_interval; neigh_obsolete = jiffies - dt; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) { ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; } } } out_unlock: spin_unlock_irqrestore(&priv->lock, flags); ipoib_mcast_remove_list(&remove_list); } static void ipoib_reap_neigh(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); __ipoib_reap_neigh(priv); queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, struct net_device *dev) { struct ipoib_neigh *neigh; neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); if (!neigh) return NULL; neigh->dev =
dev; memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); skb_queue_head_init(&neigh->queue); INIT_LIST_HEAD(&neigh->list); ipoib_cm_set(neigh, NULL); /* one ref on behalf of the caller */ refcount_set(&neigh->refcnt, 1); return neigh; } struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh; u32 hash_val; htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) { neigh = NULL; goto out_unlock; } /* need to add a new neigh, but maybe some other thread succeeded? * recalc hash, maybe hash resize took place so we do a search */ hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], lockdep_is_held(&priv->lock)); neigh != NULL; neigh = rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */ neigh = NULL; break; } neigh->alive = jiffies; goto out_unlock; } } neigh = ipoib_neigh_ctor(daddr, dev); if (!neigh) goto out_unlock; /* one ref on behalf of the hash table */ refcount_inc(&neigh->refcnt); neigh->alive = jiffies; /* put in hash */ rcu_assign_pointer(neigh->hnext, rcu_dereference_protected(htbl->buckets[hash_val], lockdep_is_held(&priv->lock))); rcu_assign_pointer(htbl->buckets[hash_val], neigh); atomic_inc(&ntbl->entries); out_unlock: return neigh; } void ipoib_neigh_dtor(struct ipoib_neigh *neigh) { /* neigh reference count was dropped to zero */ struct net_device *dev = neigh->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sk_buff *skb; if (neigh->ah) ipoib_put_ah(neigh->ah); while ((skb = __skb_dequeue(&neigh->queue))) { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } if (ipoib_cm_get(neigh)) ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); ipoib_dbg(ipoib_priv(dev), "neigh free for %06x %pI6\n", IPOIB_QPN(neigh->daddr), neigh->daddr + 4); kfree(neigh); if (atomic_dec_and_test(&priv->ntbl.entries)) { if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) complete(&priv->ntbl.flushed); } } static void ipoib_neigh_reclaim(struct rcu_head *rp) { /* Called as a result of removal from hash table */ struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); /* note TX context may hold another ref */ ipoib_neigh_put(neigh); } void ipoib_neigh_free(struct ipoib_neigh *neigh) { struct net_device *dev = neigh->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu **np; struct ipoib_neigh *n; u32 hash_val; htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) return; hash_val = ipoib_addr_hash(htbl, neigh->daddr); np = &htbl->buckets[hash_val]; for (n = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock)); n != NULL; n = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) { if (n == neigh) { /* found */ rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { np = &n->hnext; } } } static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu
**buckets; u32 size; clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); ntbl->htbl = NULL; htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); if (!htbl) return -ENOMEM; size = roundup_pow_of_two(arp_tbl.gc_thresh3); buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL); if (!buckets) { kfree(htbl); return -ENOMEM; } htbl->size = size; htbl->mask = (size - 1); htbl->buckets = buckets; RCU_INIT_POINTER(ntbl->htbl, htbl); htbl->ntbl = ntbl; atomic_set(&ntbl->entries, 0); /* start garbage collection */ queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct ipoib_neigh_hash *htbl = container_of(head, struct ipoib_neigh_hash, rcu); struct ipoib_neigh __rcu **buckets = htbl->buckets; struct ipoib_neigh_table *ntbl = htbl->ntbl; kvfree(buckets); kfree(htbl); complete(&ntbl->deleted); } void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; int i; /* remove all neighs connected to a given path or mcast */ spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { /* delete neighs belonging to this parent */ if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; } } } out_unlock: spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; int i, wait_flushed = 0; init_completion(&priv->ntbl.flushed); set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; wait_flushed = atomic_read(&priv->ntbl.entries); if (!wait_flushed) goto free_htbl; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } free_htbl: rcu_assign_pointer(ntbl->htbl, NULL); call_rcu(&htbl->rcu, neigh_hash_free_rcu); out_unlock: spin_unlock_irqrestore(&priv->lock, flags); if (wait_flushed) wait_for_completion(&priv->ntbl.flushed); } static void ipoib_neigh_hash_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "%s\n", __func__); init_completion(&priv->ntbl.deleted); cancel_delayed_work_sync(&priv->neigh_reap_task); ipoib_flush_neighs(priv); wait_for_completion(&priv->ntbl.deleted); } static void ipoib_napi_add(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); }
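/* * RX and TX completions are polled by separate NAPI contexts: recv_napi * runs ipoib_rx_poll() with weight IPOIB_NUM_WC, send_napi runs * ipoib_tx_poll() with weight MAX_SEND_CQE. */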
static void ipoib_napi_del(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_napi_del(&priv->recv_napi); netif_napi_del(&priv->send_napi); } static void ipoib_dev_uninit_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_transport_dev_cleanup(dev); ipoib_napi_del(dev); ipoib_cm_dev_cleanup(dev); kfree(priv->rx_ring); vfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static int ipoib_dev_init_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); u8 addr_mod[3]; ipoib_napi_add(dev); /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kcalloc(ipoib_recvq_size, sizeof(*priv->rx_ring), GFP_KERNEL); if (!priv->rx_ring) goto out; priv->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*priv->tx_ring))); if (!priv->tx_ring) { pr_warn("%s: failed to allocate TX ring (%d entries)\n", priv->ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */ if (ipoib_transport_dev_init(dev, priv->ca)) { pr_warn("%s: ipoib_transport_dev_init failed\n", priv->ca->name); goto out_tx_ring_cleanup; } /* after qp created set dev address */ addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff; addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff; addr_mod[2] = (priv->qp->qp_num) & 0xff; dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod)); return 0; out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: ipoib_napi_del(dev); return -ENOMEM; } static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!priv->rn_ops->ndo_eth_ioctl) return -EOPNOTSUPP; return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd); } static int ipoib_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = -ENOMEM; priv->qp = NULL; /* * the various IPoIB tasks assume they will never race against * themselves, so always use a single thread workqueue */ priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); if (!priv->wq) { pr_warn("%s: failed to allocate device WQ\n", dev->name); goto out; } /* create pd, which is used both for control and datapath */ priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { pr_warn("%s: failed to allocate PD\n", priv->ca->name); goto clean_wq; } ret = priv->rn_ops->ndo_init(dev); if (ret) { pr_warn("%s failed to init HW resource\n", dev->name); goto out_free_pd; } ret = ipoib_neigh_hash_init(priv); if (ret) { pr_warn("%s failed to init neigh hash\n", dev->name); goto out_dev_uninit; } if (dev->flags & IFF_UP) { if (ipoib_ib_dev_open(dev)) { pr_warn("%s failed to open device\n", dev->name); ret = -ENODEV; goto out_hash_uninit; } } return 0; out_hash_uninit: ipoib_neigh_hash_uninit(dev); out_dev_uninit: ipoib_ib_dev_cleanup(dev); out_free_pd: if (priv->pd) { ib_dealloc_pd(priv->pd); priv->pd = NULL; } clean_wq: if (priv->wq) { destroy_workqueue(priv->wq); priv->wq = NULL; } out: return ret; } /* * This must be called before doing an unregister_netdev on a parent device to * shutdown the IB event handler. */ static void ipoib_parent_unregister_pre(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); /* * ipoib_set_mac checks netif_running before pushing work, clearing * running ensures it will not add more work.
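* Work that was queued before this point is drained by the * flush_workqueue() call below.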
*/ rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); rtnl_unlock(); /* ipoib_event() cannot be running once this returns */ ib_unregister_event_handler(&priv->event_handler); /* * Work on the queue grabs the rtnl lock, so this cannot be done while * also holding it. */ flush_workqueue(ipoib_workqueue); } static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) { priv->hca_caps = priv->ca->attrs.device_cap_flags; priv->kernel_caps = priv->ca->attrs.kernel_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; if (priv->kernel_caps & IBK_UD_TSO) priv->dev->hw_features |= NETIF_F_TSO; priv->dev->features |= priv->dev->hw_features; } } static int ipoib_parent_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ib_port_attr attr; int result; result = ib_query_port(priv->ca, priv->port, &attr); if (result) { pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, priv->port); return result; } priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr); result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); if (result) { pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", priv->ca->name, priv->port, result); return result; } result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); if (result) { pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", priv->ca->name, priv->port, result); return result; } dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid)); SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); priv->dev->dev_port = priv->port - 1; /* Let's set this one too for backwards compatibility. */ priv->dev->dev_id = priv->port - 1; return 0; } static void ipoib_child_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); priv->max_ib_mtu = ppriv->max_ib_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN)) memcpy(&priv->local_gid, priv->dev->dev_addr + 4, sizeof(priv->local_gid)); else { __dev_addr_set(priv->dev, ppriv->dev->dev_addr, INFINIBAND_ALEN); memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); } } static int ipoib_ndo_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); int rc; struct rdma_netdev *rn = netdev_priv(ndev); if (priv->parent) { ipoib_child_init(ndev); } else { rc = ipoib_parent_init(ndev); if (rc) return rc; } /* MTU will be reset when mcast join happens */ ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = ndev->mtu; rn->mtu = priv->mcast_mtu; ndev->max_mtu = IPOIB_CM_MTU; ndev->neigh_priv_len = sizeof(struct ipoib_neigh); /* * Set the full membership bit, so that we join the right * broadcast group, etc. 
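* The top bit of the 16-bit P_Key is the membership bit: 1 denotes a * full member, 0 a limited member.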
*/ priv->pkey |= 0x8000; ndev->broadcast[8] = priv->pkey >> 8; ndev->broadcast[9] = priv->pkey & 0xff; set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); ipoib_set_dev_features(priv); rc = ipoib_dev_init(ndev); if (rc) { pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", priv->ca->name, priv->dev->name, priv->port, rc); return rc; } if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); dev_hold(priv->parent); down_write(&ppriv->vlan_rwsem); list_add_tail(&priv->list, &ppriv->child_intfs); up_write(&ppriv->vlan_rwsem); } return 0; } static void ipoib_ndo_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ASSERT_RTNL(); /* * ipoib_remove_one guarantees the children are removed before the * parent, and that is the only place where a parent can be removed. */ WARN_ON(!list_empty(&priv->child_intfs)); if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); down_write(&ppriv->vlan_rwsem); list_del(&priv->list); up_write(&ppriv->vlan_rwsem); } ipoib_neigh_hash_uninit(dev); ipoib_ib_dev_cleanup(dev); /* no more works over the priv->wq */ if (priv->wq) { /* See ipoib_mcast_carrier_on_task() */ WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)); destroy_workqueue(priv->wq); priv->wq = NULL; } dev_put(priv->parent); } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); } static int ipoib_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int err; err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); if (err) return err; ivf->vf = vf; memcpy(ivf->mac, dev->dev_addr, dev->addr_len); return 0; } static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) return -EINVAL; return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); } static int ipoib_get_vf_guid(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid); } static int ipoib_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); } static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_init = ipoib_ndo_init, .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, .ndo_fix_features = ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_set_vf_link_state = ipoib_set_vf_link_state, .ndo_get_vf_config = ipoib_get_vf_config, .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_get_vf_guid = ipoib_get_vf_guid, .ndo_set_vf_guid = ipoib_set_vf_guid, .ndo_set_mac_address = ipoib_set_mac, .ndo_get_stats64 = ipoib_get_stats, .ndo_eth_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_init = ipoib_ndo_init, .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, .ndo_fix_features = 
ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_get_stats64 = ipoib_get_stats, .ndo_eth_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_default_pf = { .ndo_init = ipoib_dev_init_default, .ndo_uninit = ipoib_dev_uninit_default, .ndo_open = ipoib_ib_dev_open_default, .ndo_stop = ipoib_ib_dev_stop_default, }; void ipoib_setup_common(struct net_device *dev) { dev->header_ops = &ipoib_header_ops; dev->netdev_ops = &ipoib_netdev_default_pf; ipoib_set_ethtool_ops(dev); dev->watchdog_timeo = 10 * HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; dev->hard_header_len = IPOIB_HARD_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); /* * unregister_netdev always frees the netdev, we use this mode * consistently to unify all the various unregister paths, including * those connected to rtnl_link_ops which require it. */ dev->needs_free_netdev = true; } static void ipoib_build_priv(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); priv->dev = dev; spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port, const char *name) { struct net_device *dev; dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name, NET_NAME_UNKNOWN, ipoib_setup_common); if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP) return dev; dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN, ipoib_setup_common); if (!dev) return ERR_PTR(-ENOMEM); return dev; } int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, struct net_device *dev) { struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_dev_priv *priv; int rc; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; priv->ca = hca; priv->port = port; rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name, NET_NAME_UNKNOWN, ipoib_setup_common, dev); if (rc) { if (rc != -EOPNOTSUPP) goto out; rn->send = ipoib_send; rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; rn->hca = hca; rc = netif_set_real_num_tx_queues(dev, 1); if (rc) goto out; rc = netif_set_real_num_rx_queues(dev, 1); if (rc) goto out; } priv->rn_ops = dev->netdev_ops; if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION) dev->netdev_ops = &ipoib_netdev_ops_vf; else dev->netdev_ops = &ipoib_netdev_ops_pf; rn->clnt_priv = priv; /* * Only the child register_netdev flows can handle priv_destructor * being set, so we force it to NULL here and 
handle manually until it * is safe to turn on. */ priv->next_priv_destructor = dev->priv_destructor; dev->priv_destructor = NULL; ipoib_build_priv(dev); return 0; out: kfree(priv); return rc; } struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, const char *name) { struct net_device *dev; int rc; dev = ipoib_alloc_netdev(hca, port, name); if (IS_ERR(dev)) return dev; rc = ipoib_intf_init(hca, port, name, dev); if (rc) { free_netdev(dev); return ERR_PTR(rc); } /* * Upon success the caller must ensure ipoib_intf_free is called or * register_netdevice succeeded and priv_destructor is set to * ipoib_intf_free. */ return dev; } void ipoib_intf_free(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); dev->priv_destructor = priv->next_priv_destructor; if (dev->priv_destructor) dev->priv_destructor(dev); /* * There are some error flows around register_netdev failing that may * attempt to call priv_destructor twice; prevent that from happening. */ dev->priv_destructor = NULL; /* unregister/destroy is very complicated. Make bugs more obvious. */ rn->clnt_priv = NULL; kfree(priv); } static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sysfs_emit(buf, "0x%04x\n", priv->pkey); } static DEVICE_ATTR_RO(pkey); static ssize_t umcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sysfs_emit(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); } void ipoib_set_umcast(struct net_device *ndev, int umcast_val) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); if (umcast_val > 0) { set_bit(IPOIB_FLAG_UMCAST, &priv->flags); ipoib_warn(priv, "ignoring multicast groups joined directly " "by userspace\n"); } else clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); } static ssize_t umcast_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long umcast_val = simple_strtoul(buf, NULL, 0); ipoib_set_umcast(to_net_dev(dev), umcast_val); return count; } static DEVICE_ATTR_RW(umcast); int ipoib_add_umcast_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_umcast); } static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) { struct ipoib_dev_priv *child_priv; struct net_device *netdev = priv->dev; netif_addr_lock_bh(netdev); memcpy(&priv->local_gid.global.interface_id, &gid->global.interface_id, sizeof(gid->global.interface_id)); dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid)); clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); netif_addr_unlock_bh(netdev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { down_read(&priv->vlan_rwsem); list_for_each_entry(child_priv, &priv->child_intfs, list) set_base_guid(child_priv, gid); up_read(&priv->vlan_rwsem); } } static int ipoib_check_lladdr(struct net_device *dev, struct sockaddr_storage *ss) { union ib_gid *gid = (union ib_gid *)(ss->__data + 4); int ret = 0; netif_addr_lock_bh(dev); /* Make sure the QPN, reserved and subnet prefix match the current * lladdr; it also makes sure the lladdr is unicast.
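* An all-zero interface ID is rejected as well, as it cannot form a * valid unicast GID.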
*/ if (memcmp(dev->dev_addr, ss->__data, 4 + sizeof(gid->global.subnet_prefix)) || gid->global.interface_id == 0) ret = -EINVAL; netif_addr_unlock_bh(dev); return ret; } static int ipoib_set_mac(struct net_device *dev, void *addr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sockaddr_storage *ss = addr; int ret; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; ret = ipoib_check_lladdr(dev, ss); if (ret) return ret; set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static ssize_t create_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; ret = ipoib_vlan_add(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(create_child); static ssize_t delete_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey < 0 || pkey > 0xffff) return -EINVAL; ret = ipoib_vlan_delete(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_pkey); } /* * We erroneously exposed the iface's port number in the dev_id * sysfs field long after dev_port was introduced for that purpose[1], * and we need to stop everyone from relying on that. * Let's overload the show routine for the dev_id file here * to gently bring the issue up. * * [1] https://www.spinics.net/lists/netdev/msg272123.html */ static ssize_t dev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); /* * ndev->dev_port will be equal to 0 in old kernels prior to commit * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface * port numbers"). Zero was chosen as a special case for user space * applications to fall back and query dev_id to check if it has a * different value or not. * * Don't print a warning in such a scenario. * * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358 */ if (ndev->dev_port && ndev->dev_id == ndev->dev_port) netdev_info_once(ndev, "\"%s\" wants to know my dev_id. Should it look at dev_port instead? 
See Documentation/ABI/testing/sysfs-class-net for more info.\n", current->comm); return sysfs_emit(buf, "%#x\n", ndev->dev_id); } static DEVICE_ATTR_RO(dev_id); static int ipoib_intercept_dev_id_attr(struct net_device *dev) { device_remove_file(&dev->dev, &dev_attr_dev_id); return device_create_file(&dev->dev, &dev_attr_dev_id); } static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u32 port) { struct rtnl_link_ops *ops = ipoib_get_link_ops(); struct rdma_netdev_alloc_params params; struct ipoib_dev_priv *priv; struct net_device *ndev; int result; ndev = ipoib_intf_alloc(hca, port, format); if (IS_ERR(ndev)) { pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port, PTR_ERR(ndev)); return ndev; } priv = ipoib_priv(ndev); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); ib_register_event_handler(&priv->event_handler); /* call event handler to ensure pkey in sync */ queue_work(ipoib_workqueue, &priv->flush_heavy); ndev->rtnl_link_ops = ipoib_get_link_ops(); result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", hca->name, port, result); ipoib_parent_unregister_pre(ndev); ipoib_intf_free(ndev); free_netdev(ndev); return ERR_PTR(result); } if (hca->ops.rdma_netdev_get_params) { int rc = hca->ops.rdma_netdev_get_params(hca, port, RDMA_NETDEV_IPOIB, &params); if (!rc && ops->priv_size < params.sizeof_priv) ops->priv_size = params.sizeof_priv; } /* * We cannot set priv_destructor before register_netdev because we * need priv to be always valid during the error flow to execute * ipoib_parent_unregister_pre(). Instead handle it manually and only * enter priv_destructor mode once we are completely registered. */ ndev->priv_destructor = ipoib_intf_free; if (ipoib_intercept_dev_id_attr(ndev)) goto sysfs_failed; if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; if (device_create_file(&ndev->dev, &dev_attr_create_child)) goto sysfs_failed; if (device_create_file(&ndev->dev, &dev_attr_delete_child)) goto sysfs_failed; return ndev; sysfs_failed: ipoib_parent_unregister_pre(ndev); unregister_netdev(ndev); return ERR_PTR(-ENOMEM); } static int ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct net_device *dev; struct ipoib_dev_priv *priv; unsigned int p; int count = 0; dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); if (!dev_list) return -ENOMEM; INIT_LIST_HEAD(dev_list); rdma_for_each_port (device, p) { if (!rdma_protocol_ib(device, p)) continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { priv = ipoib_priv(dev); list_add_tail(&priv->list, dev_list); count++; } } if (!count) { kfree(dev_list); return -EOPNOTSUPP; } ib_set_client_data(device, &ipoib_client, dev_list); return 0; } static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; struct list_head *dev_list = client_data; list_for_each_entry_safe(priv, tmp, dev_list, list) { LIST_HEAD(head); ipoib_parent_unregister_pre(priv->dev); rtnl_lock(); list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) unregister_netdevice_queue(cpriv->dev, &head); unregister_netdevice_queue(priv->dev, &head); unregister_netdevice_many(&head); rtnl_unlock(); } kfree(dev_list); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static struct notifier_block ipoib_netdev_notifier = { .notifier_call = ipoib_netdev_event, }; #endif static int __init 
ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); #endif /* * When copying small received packets, we only copy from the * linear data part of the SKB, so we rely on this condition. */ BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); ipoib_register_debugfs(); /* * We create a global workqueue here that is used for all flush * operations. However, if you attempt to flush a workqueue * from a task on that same workqueue, it deadlocks the system. * We want to be able to flush the tasks associated with a * specific net device, so we also create a workqueue for each * netdevice. We queue up the tasks for that device only on * its private workqueue, and we only queue up flush events * on our global flush workqueue. This avoids the deadlocks. */ ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; ret = ipoib_netlink_init(); if (ret) goto err_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG register_netdevice_notifier(&ipoib_netdev_notifier); #endif return 0; err_client: ib_unregister_client(&ipoib_client); err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: ipoib_unregister_debugfs(); return ret; } static void __exit ipoib_cleanup_module(void) { #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG unregister_netdevice_notifier(&ipoib_netdev_notifier); #endif ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); destroy_workqueue(ipoib_workqueue); } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module);
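The dev_id_show() override above nudges userspace toward dev_port. A minimal userspace sketch (not part of the kernel sources) of the recommended fallback logic, assuming a hypothetical interface name "ib0" and sysfs mounted at /sys:

#include <stdio.h>

/* Read a numeric sysfs attribute; %li accepts both decimal and 0x-prefixed hex. */
static long read_netdev_attr(const char *ifname, const char *attr)
{
	char path[128];
	long val;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/class/net/%s/%s", ifname, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%li", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	const char *ifname = "ib0";	/* hypothetical interface name */
	long port = read_netdev_attr(ifname, "dev_port");

	/* dev_port == 0 can mean a pre-dev_port kernel: fall back to dev_id. */
	if (port <= 0)
		port = read_netdev_attr(ifname, "dev_id");
	printf("%s: port index %ld\n", ifname, port);
	return 0;
}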
/* * memfd_create system call and file sealing support * * Code was originally included in shmem.c, and broken out to facilitate * use by hugetlbfs as well as tmpfs. * * This file is released under the GPL. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/khugepaged.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> /* * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ static bool memfd_folio_has_extra_refs(struct folio *folio) { return folio_ref_count(folio) - folio_mapcount(folio) != folio_nr_pages(folio); } static void memfd_tag_pins(struct xa_state *xas) { struct folio *folio; int latency = 0; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, folio, ULONG_MAX) { if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(xas); xas_unlock_irq(xas); cond_resched(); xas_lock_irq(xas); } xas_unlock_irq(xas); } /* * This is a helper function used by memfd_pin_folios() in GUP (gup.c). * It is mainly called to allocate a folio in a memfd when the caller * (memfd_pin_folios()) cannot find a folio in the page cache at a given * index in the mapping. 
*/ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; int err; if (is_file_hugepages(memfd)) { /* * The folio would most likely be accessed by a DMA driver, * therefore, we have zone memory constraints where we can * alloc from. Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. */ struct hstate *h = hstate_file(memfd); gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, gfp_mask); if (folio) { err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); if (err) { folio_put(folio); return ERR_PTR(err); } folio_unlock(folio); return folio; } return ERR_PTR(-ENOMEM); } #endif return shmem_read_folio(memfd->f_mapping, idx); } /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all folios * and see whether they have an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; int error, scan; memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; xas_set(&xas, 0); xas_lock_irq(&xas); xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found folios pinned. */ if (scan == LAST_SCAN) error = -EBUSY; else clear = false; } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } return error; } static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif return NULL; } #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. * * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. 
Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } /* * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } static int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); return seals ? *seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { long error; switch (cmd) { case F_ADD_SEALS: error = memfd_add_seals(file, arg); break; case F_GET_SEALS: error = memfd_get_seals(file); break; default: error = -EINVAL; break; } return error; } #define MFD_NAME_PREFIX "memfd:" #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL struct pid_namespace *ns = task_active_pid_ns(current); int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { pr_err_ratelimited( "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif return 0; } static inline bool is_write_sealed(unsigned int seals) { return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } static int check_write_seal(unsigned long *vm_flags_ptr) { unsigned long vm_flags = *vm_flags_ptr; unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) return 0; /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * write seals are active. */ if (mask & VM_WRITE) return -EPERM; /* * This is a read-only mapping, disallow mprotect() from making a * write-sealed mapping writable in future. 
*/ *vm_flags_ptr &= ~VM_MAYWRITE; return 0; } int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); unsigned int seals = seals_ptr ? *seals_ptr : 0; if (is_write_sealed(seals)) err = check_write_seal(vm_flags_ptr); return err; } static int sanitize_flags(unsigned int *flags_ptr) { unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; return check_sysctl_memfd_noexec(flags_ptr); } static char *alloc_name(const char __user *uname) { int error; char *name; long len; name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return ERR_PTR(-ENOMEM); strcpy(name, MFD_NAME_PREFIX); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { error = -EFAULT; goto err_name; } else if (len > MFD_NAME_MAX_LEN) { error = -EINVAL; goto err_name; } return name; err_name: kfree(name); return ERR_PTR(error); } static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { file = shmem_file_setup(name, 0, VM_NORESERVE); } if (IS_ERR(file)) return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { struct inode *inode = file_inode(file); inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { *file_seals &= ~F_SEAL_SEAL; *file_seals |= F_SEAL_EXEC; } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); if (file_seals) *file_seals &= ~F_SEAL_SEAL; } return file; } SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { struct file *file; int fd, error; char *name; error = sanitize_flags(&flags); if (error < 0) return error; name = alloc_name(uname); if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); if (fd < 0) { error = fd; goto err_name; } file = alloc_file(name, flags); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } fd_install(fd, file); kfree(name); return fd; err_fd: put_unused_fd(fd); err_name: kfree(name); return error; }
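A minimal userspace sketch (not part of the kernel sources) of the sealing flow that memfd_add_seals() implements, assuming glibc's memfd_create() wrapper is available: the fd is created sealable (MFD_ALLOW_SEALING, so alloc_file() above clears F_SEAL_SEAL), sized, and then locked down; F_GET_SEALS reports the accumulated set.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* Sealable, close-on-exec memfd backed by tmpfs. */
	int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;

	/*
	 * Seals only accumulate. Including F_SEAL_SEAL also makes this
	 * the last F_ADD_SEALS call that can succeed on the file.
	 */
	if (fcntl(fd, F_ADD_SEALS,
		  F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL) < 0)
		return 1;

	printf("seals: %#x\n", fcntl(fd, F_GET_SEALS));
	close(fd);
	return 0;
}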
// SPDX-License-Identifier: GPL-2.0 #include <linux/fault-inject.h> #include <linux/debugfs.h> #include <linux/error-injection.h> #include <linux/mm.h> static struct { struct fault_attr attr; bool ignore_gfp_highmem; bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; static int __init setup_fail_page_alloc(char *str) { return setup_fault_attr(&fail_page_alloc.attr, str); } __setup("fail_page_alloc=", setup_fail_page_alloc); bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { int flags = 0; if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; if (fail_page_alloc.ignore_gfp_reclaim && (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) flags |= FAULT_NOWARN; return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_page_alloc_debugfs(void) { umode_t mode = S_IFREG | 0600; struct dentry *dir; dir = fault_create_debugfs_attr("fail_page_alloc", NULL, &fail_page_alloc.attr); debugfs_create_bool("ignore-gfp-wait", mode, dir, &fail_page_alloc.ignore_gfp_reclaim); debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem); debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); return 0; } late_initcall(fail_page_alloc_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
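A hedged usage sketch for the knobs created above: with CONFIG_FAULT_INJECTION_DEBUG_FS enabled, the settings otherwise passed as the fail_page_alloc=<interval>,<probability>,<space>,<times> boot parameter can be written at runtime, alongside the min-order attribute this file adds. Paths assume debugfs is mounted at /sys/kernel/debug.

#include <stdio.h>

static int write_attr(const char *attr, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/fail_page_alloc/%s", attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Fail every eligible allocation of order >= 1, with no count limit. */
	write_attr("probability", "100");
	write_attr("interval", "1");
	write_attr("times", "-1");
	write_attr("min-order", "1");
	return 0;
}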
// SPDX-License-Identifier: GPL-2.0 /* * Lockless hierarchical page accounting & limiting * * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner */ #include <linux/page_counter.h> #include <linux/atomic.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/bug.h> #include <asm/page.h> static bool track_protection(struct page_counter *c) { return c->protection_support; } static void propagate_protected_usage(struct page_counter *c, unsigned long usage) { unsigned long protected, old_protected; long delta; if (!c->parent) return; protected = min(usage, READ_ONCE(c->min)); old_protected = atomic_long_read(&c->min_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->min_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_min_usage); } protected = min(usage, READ_ONCE(c->low)); old_protected = atomic_long_read(&c->low_usage); if (protected != old_protected) { old_protected = atomic_long_xchg(&c->low_usage, protected); delta = protected - old_protected; if (delta) atomic_long_add(delta, &c->parent->children_low_usage); } } /** * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel */ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? 
*/ if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n", new, nr_pages)) { new = 0; atomic_long_set(&counter->usage, new); } if (track_protection(counter)) propagate_protected_usage(counter, new); } /** * page_counter_charge - hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * * NOTE: This does not consider any configured counter limits. */ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; bool protection = track_protection(counter); for (c = counter; c; c = c->parent) { long new; new = atomic_long_add_return(nr_pages, &c->usage); if (protection) propagate_protected_usage(c, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. * * Notably, we have two watermarks to allow for both a globally * visible peak and one that can be reset at a smaller scope. * * Since we reset both watermarks when the global reset occurs, * we can guarantee that watermark >= local_watermark, so we * don't need to do both comparisons every time. * * On systems with branch predictors, the inner condition should * be almost free. */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } } /** * page_counter_try_charge - try to hierarchically charge pages * @counter: counter * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * * Returns %true on success, or %false and @fail if the counter or one * of its ancestors has hit its configured limit. */ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail) { struct page_counter *c; bool protection = track_protection(counter); bool track_failcnt = counter->track_failcnt; for (c = counter; c; c = c->parent) { long new; /* * Charge speculatively to avoid an expensive CAS. If * a bigger charge fails, it might falsely lock out a * racing smaller charge and send it into reclaim * early, but the error is limited to the difference * between the two sizes, which is less than 2M/4M in * case of a THP locking out a regular page charge. * * The atomic_long_add_return() implies a full memory * barrier between incrementing the count and reading * the limit. When racing with page_counter_set_max(), * we either see the new limit or the setter sees the * counter has changed and retries. */ new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used * to report stats. 
*/ if (track_failcnt) data_race(c->failcnt++); *fail = c; goto failed; } if (protection) propagate_protected_usage(c, new); /* see comment on page_counter_charge */ if (new > READ_ONCE(c->local_watermark)) { WRITE_ONCE(c->local_watermark, new); if (new > READ_ONCE(c->watermark)) WRITE_ONCE(c->watermark, new); } } return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); return false; } /** * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge */ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; for (c = counter; c; c = c->parent) page_counter_cancel(c, nr_pages); } /** * page_counter_set_max - set the maximum number of pages allowed * @counter: counter * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; long usage; /* * Update the limit while making sure that it's not * below the concurrently-changing counter value. * * The xchg implies two full memory barriers before * and after, so the read-swap-read is ordered and * ensures coherency with page_counter_try_charge(): * that function modifies the count before checking * the limit, so if it sees the old limit, we see the * modified counter and retry. */ usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; cond_resched(); } } /** * page_counter_set_min - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->min, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_set_low - set the amount of protected memory * @counter: counter * @nr_pages: value to set * * The caller must serialize invocations on the same counter. */ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; WRITE_ONCE(counter->low, nr_pages); for (c = counter; c; c = c->parent) propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. */ int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages) { char *end; u64 bytes; if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } bytes = memparse(buf, &end); if (*end != '\0') return -EINVAL; *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); return 0; } #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) /* * This function calculates an individual page counter's effective * protection which is derived from its own memory.min/low, its * parent's and siblings' settings, as well as the actual memory * distribution in the tree. 
* * The following rules apply to the effective protection values: * * 1. At the first level of reclaim, effective protection is equal to * the declared protection in memory.min and memory.low. * * 2. To enable safe delegation of the protection configuration, at * subsequent levels the effective protection is capped to the * parent's effective protection. * * 3. To make complex and dynamic subtrees easier to configure, the * user is allowed to overcommit the declared protection at a given * level. If that is the case, the parent's effective protection is * distributed to the children in proportion to how much protection * they have declared and how much of it they are utilizing. * * This makes distribution proportional, but also work-conserving: * if one counter claims much more protection than it uses memory, * the unused remainder is available to its siblings. * * 4. Conversely, when the declared protection is undercommitted at a * given level, the distribution of the larger parental protection * budget is NOT proportional. A counter's protection from a sibling * is capped to its own memory.min/low setting. * * 5. However, to allow protecting recursive subtrees from each other * without having to declare each individual counter's fixed share * of the ancestor's claim to protection, any unutilized - * "floating" - protection from up the tree is distributed in * proportion to each counter's *usage*. This makes the protection * neutral wrt sibling cgroups and lets them compete freely over * the shared parental protection budget, but it protects the * subtree as a whole from neighboring subtrees. * * Note that 4. and 5. are not in conflict: 4. is about protecting * against immediate siblings whereas 5. is about protecting against * neighboring subtrees. */ static unsigned long effective_protection(unsigned long usage, unsigned long parent_usage, unsigned long setting, unsigned long parent_effective, unsigned long siblings_protected, bool recursive_protection) { unsigned long protected; unsigned long ep; protected = min(usage, setting); /* * If all cgroups at this level combined claim and use more * protection than what the parent affords them, distribute * shares in proportion to utilization. * * We are using actual utilization rather than the statically * claimed protection in order to be work-conserving: claimed * but unused protection is available to siblings that would * otherwise get a smaller chunk than what they claimed. */ if (siblings_protected > parent_effective) return protected * parent_effective / siblings_protected; /* * Ok, utilized protection of all children is within what the * parent affords them, so we know whatever this child claims * and utilizes is effectively protected. * * If there is unprotected usage beyond this value, reclaim * will apply pressure in proportion to that amount. * * If there is unutilized protection, the cgroup will be fully * shielded from reclaim, but we do return a smaller value for * protection than what the group could enjoy in theory. This * is okay. With the overcommit distribution above, effective * protection is always dependent on how memory is actually * consumed among the siblings anyway. */ ep = protected; /* * If the children aren't claiming (all of) the protection * afforded to them by the parent, distribute the remainder in * proportion to the (unprotected) memory of each cgroup. 
That * way, cgroups that aren't explicitly prioritized wrt each * other compete freely over the allowance, but they are * collectively protected from neighboring trees. * * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. * * Check both usage and parent_usage against the respective * protected values. One should imply the other, but they * aren't read atomically - make sure the division is sane. */ if (!recursive_protection) return ep; if (parent_effective > siblings_protected && parent_usage > siblings_protected && usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; unclaimed *= usage - protected; unclaimed /= parent_usage - siblings_protected; ep += unclaimed; } return ep; } /** * page_counter_calculate_protection - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @counter: the page_counter to update * @recursive_protection: Whether to use memory_recursiveprot behavior. * * Calculates elow/emin thresholds for the given page_counter. * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. */ void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection) { unsigned long usage, parent_usage; struct page_counter *parent = counter->parent; /* * Effective values of the reclaim targets are ignored so they * can be stale. Have a look at mem_cgroup_protection for more * details. * TODO: calculation should be more robust so that we do not need * that special casing. */ if (root == counter) return; usage = page_counter_read(counter); if (!usage) return; if (parent == root) { counter->emin = READ_ONCE(counter->min); counter->elow = READ_ONCE(counter->low); return; } parent_usage = page_counter_read(parent); WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage, READ_ONCE(counter->min), READ_ONCE(parent->emin), atomic_long_read(&parent->children_min_usage), recursive_protection)); WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage, READ_ONCE(counter->low), READ_ONCE(parent->elow), atomic_long_read(&parent->children_low_usage), recursive_protection)); } #endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */
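A worked numeric sketch (made-up values, not kernel code) of the overcommit rule in effective_protection() above: when the children's combined protected usage exceeds the parent's effective protection, each child's share is scaled by parent_effective / siblings_protected.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long parent_effective = 100;	/* parent's emin, in pages */
	/* Child A claims min=100 and uses 80; child B claims min=100 and uses 70. */
	unsigned long prot_a = min_ul(80, 100);	/* protected = min(usage, setting) = 80 */
	unsigned long prot_b = min_ul(70, 100);	/* = 70 */
	unsigned long siblings_protected = prot_a + prot_b;	/* 150 > 100: overcommitted */

	/* Proportional, work-conserving split of the parent's 100 pages. */
	printf("A: %lu\n", prot_a * parent_effective / siblings_protected);	/* 53 */
	printf("B: %lu\n", prot_b * parent_effective / siblings_protected);	/* 46 */
	return 0;
}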
// SPDX-License-Identifier: GPL-2.0 /* * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> #include <linux/audit.h> #include <linux/random.h> #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) #define TK_UPDATE_ALL (TK_CLEAR_NTP | TK_CLOCK_WAS_SET) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ TK_ADV_TICK, /* Update timekeeper on a direct frequency change */ TK_ADV_FREQ }; /* * The most important data for readout fits into a single 64 byte * cache line. */ struct tk_data { seqcount_raw_spinlock_t seq; struct timekeeper timekeeper; struct timekeeper shadow_timekeeper; raw_spinlock_t lock; } ____cacheline_aligned; static struct tk_data tk_core; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit * is the index for the tk_read_base array * @base: tk_read_base array. Access is indexed by the lowest bit of * @seq. * * See @update_fast_timekeeper() below. */ struct tk_fast { seqcount_latch_t seq; struct tk_read_base base[2]; }; /* Suspend-time cycles value for halted fast timekeeper. */ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { if (timekeeping_suspended) return cycles_at_suspend; return local_clock(); } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; /* * Boot time initialization which allows local_clock() to be utilized * during early boot when clocksources are not available. local_clock() * returns nanoseconds already so no conversion is required, hence mult=1 * and shift=0. 
When the first proper clocksource is installed then * the fast time keepers are updated with the correct values. */ #define FAST_TK_INIT \ { \ .clock = &dummy_clock, \ .mask = CLOCKSOURCE_MASK(64), \ .mult = 1, \ .shift = 0, \ } static struct tk_fast tk_fast_mono ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq), .base[0] = FAST_TK_INIT, .base[1] = FAST_TK_INIT, }; unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; raw_spin_lock_irqsave(&tk_core.lock, flags); return flags; } void timekeeper_unlock_irqrestore(unsigned long flags) { raw_spin_unlock_irqrestore(&tk_core.lock, flags); } /* * Multigrain timestamps require tracking the latest fine-grained timestamp * that has been issued, and never returning a coarse-grained timestamp that is * earlier than that value. * * mg_floor represents the latest fine-grained time that has been handed out as * a file timestamp on the system. This is tracked as a monotonic ktime_t, and * converted to a realtime clock value on an as-needed basis. * * Maintaining mg_floor ensures the multigrain interfaces never issue a * timestamp earlier than one that has been previously issued. * * The exception to this rule is when there is a backward realtime clock jump. If * such an event occurs, a timestamp can appear to be earlier than a previous one. */ static __cacheline_aligned_in_smp atomic64_t mg_floor; static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; tk->raw_sec++; } } static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp)); WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0))); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { /* Paired with READ_ONCE() in ktime_mono_to_any() */ WRITE_ONCE(tk->offs_boot, ktime_add(tk->offs_boot, delta)); /* * Timespec representation for VDSO update to avoid 64bit division * on every update. 
*/ tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } /* * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the * seqcount ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. * This isn't necessary when holding the tk_core.lock or doing * a read of the fast-timekeeper tkrs (which are protected by their own locking * and update logic). */ static inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } /** * tk_setup_internals - Set up internals to use clocksource clock. * * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment * pair and interval request. * * Unless you're the timekeeping code, you should not be using this! */ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; tk->tkr_raw.xtime_nsec >>= -shift_change; } else { tk->tkr_mono.xtime_nsec <<= shift_change; tk->tkr_raw.xtime_nsec <<= shift_change; } } tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These values will be adjusted via NTP * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) { return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; /* * This detects both negative motion and the case where the delta * overflows the multiplication with tkr->mult. 
*/ if (unlikely(delta > tkr->clock->max_cycles)) { /* * Handle clocksource inconsistency between CPUs to prevent * time from going backwards by checking for the MSB of the * mask being set in the delta. */ if (delta & ~(mask >> 1)) return tkr->xtime_nsec >> tkr->shift; return delta_to_ns_safe(tkr, delta); } return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift; } static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { return timekeeping_cycles_to_ns(tkr, tk_clock_read(tkr)); } /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * * Employ the latch technique; see @write_seqcount_latch. * * So if an NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result in a * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ write_seqcount_latch_begin(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); write_seqcount_latch_end(&tkf->seq); } static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_get_ns(tkr); } while (read_seqcount_latch_retry(&tkf->seq, seq)); return now; } /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * * This timestamp is not guaranteed to be monotonic across an update. * The timestamp is calculated by: * * now = base_mono + clock_delta * slope * * So if the update lowers the slope, readers who are forced to the * not yet updated second array are still using the old steeper slope. * * tmono * ^ * | o n * | o n * | u * | o * |o * |12345678---> reader order * * o = old slope * u = update * n = new slope * * So reader 6 will observe time going backwards versus reader 5. * * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ u64 notrace ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /** * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw * * Contrary to ktime_get_mono_fast_ns() this is always correct because the * conversion factor is not affected by NTP/PTP correction. */ u64 notrace ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); /** * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset * protected with seqcounts.
This has the following minor side effects: * * (1) It's possible that a timestamp is taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset * is added to the old timekeeping making the clock appear to update slightly * earlier: * CPU 0 CPU 1 * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); * timekeeping_update_staged(tkd, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. * * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns() * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_boot))); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /** * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock. * * The same limitations as described for ktime_get_boot_fast_ns() apply. The * mono time and the TAI offset are not read atomically which may yield wrong * readouts. However, an update of the TAI offset is a rare event, e.g. caused * by settime or adjtimex with an offset. The user of this function has to deal * with the possibility of wrong timestamps in post processing. */ u64 notrace ktime_get_tai_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(data_race(tk->offs_tai))); } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); /** * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime. * * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. */ u64 ktime_get_real_fast_ns(void) { struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); return baser + delta; } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * * It generally is unsafe to access the clocksource after timekeeping has been * suspended, so take a snapshot of the readout base of @tk and use it as the * fast timekeeper's readout base while suspended. It will return the same * number of cycles every time until timekeeping is resumed at which time the * proper readout base for the fast timekeeper will be restored automatically.
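* * The latch scheme used by update_fast_timekeeper() and __ktime_get_fast_ns() * can be modelled in plain C roughly as follows (editor's sketch with made-up * names; the real code additionally relies on the seqcount_latch memory * barriers): * * struct latch { unsigned int seq; struct data b[2]; }; * * void latch_update(struct latch *l, const struct data *d) * { * l->seq++; (odd: readers now use b[1]) * l->b[0] = *d; * l->seq++; (even: readers now use b[0]) * l->b[1] = *d; * } * * struct data latch_read(struct latch *l) * { * unsigned int s; struct data d; * do { * s = l->seq; * d = l->b[s & 1]; * } while (s != l->seq); * return d; * } * * The copy selected by the sequence count is never the one currently being * written, so even a reader which cannot wait, e.g. an NMI interrupting the * update, observes a consistent, at worst slightly stale, value.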
*/ static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; int ret; guard(raw_spinlock_irqsave)(&tk_core.lock); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { guard(raw_spinlock_irqsave)(&tk_core.lock); return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* * tk_update_leap_state - helper to update the next_leap_ktime */ static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* * Leap state update for both shadow and the real timekeeper * Separate to spare a full memcpy() of the timekeeper. */ static void tk_update_leap_state_all(struct tk_data *tkd) { write_seqcount_begin(&tkd->seq); tk_update_leap_state(&tkd->shadow_timekeeper); tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime; write_seqcount_end(&tkd->seq); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. */ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* * Restore the shadow timekeeper from the real timekeeper. 
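* * Together with timekeeping_update_from_shadow() below this implements a * double buffer scheme which, in pseudo code (editor's sketch), works like: * * raw_spin_lock(&tkd->lock); * modify(&tkd->shadow_timekeeper); (readers are unaffected) * write_seqcount_begin(&tkd->seq); * tkd->timekeeper = tkd->shadow_timekeeper; (publish) * write_seqcount_end(&tkd->seq); * raw_spin_unlock(&tkd->lock); * * On error paths the shadow copy is resynchronized from the real timekeeper * via this function instead of being published.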
*/ static void timekeeping_restore_shadow(struct tk_data *tkd) { lockdep_assert_held(&tkd->lock); memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper)); } static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { struct timekeeper *tk = &tk_core.shadow_timekeeper; lockdep_assert_held(&tkd->lock); /* * Block out readers before running the updates below because that * updates VDSO and other time related infrastructure. Not blocking * the readers might let a reader see time going backwards when * reading from the VDSO after the VDSO update and then reading in * the kernel from the timekeeper before that got updated. */ write_seqcount_begin(&tkd->seq); if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * Update the real timekeeper. * * We could avoid this memcpy() by switching pointers, but that has * the downside that the reader side no longer benefits from * the cacheline optimized data layout of the timekeeper and requires * another indirection. */ memcpy(&tkd->timekeeper, tk, sizeof(*tk)); write_seqcount_end(&tkd->seq); } /** * timekeeping_forward_now - update clock to the current time * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; while (delta > 0) { u64 max = tk->tkr_mono.clock->max_cycles; u64 incr = delta < max ? delta : max; tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult; tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult; tk_normalize_xtime(tk); delta -= incr; } } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended).
*/ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert monotonic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; if (IS_ENABLED(CONFIG_64BIT)) { /* * Paired with WRITE_ONCE()s in tk_set_wall_to_mono() and * tk_update_sleep_time(). 
*/ return ktime_add(tmono, READ_ONCE(*offset)); } do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single * non-serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wraparounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - The same as ktime_get_real_seconds() but * without the sequence counter protection. This internal function is * only called when the timekeeping lock is already held.
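* * To illustrate why the sequence counter matters on 32bit (editor's example): * tk->xtime_sec is 64bit wide, so a 32bit CPU loads it in two halves. If an * update changes 0x00000001ffffffff to 0x0000000200000000 between the two * loads, an unprotected reader could combine the new high half with the old * low half and observe 0x00000002ffffffff, i.e. a value 0xffffffff seconds * (about 136 years) in the future. The retry loop above rules that out.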
*/ noinstr time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base_raw; ktime_t base_real; ktime_t base_boot; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_id = tk->tkr_mono.clock->id; systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_boot = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_boot); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? 
total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * timestamp_in_interval - true if ts is chronologically in [start, end] * * True if ts occurs chronologically at or after start, and before or at end. */ static bool timestamp_in_interval(u64 start, u64 end, u64 ts) { if (ts >= start && ts <= end) return true; if (start > end && (ts >= start || ts <= end)) return true; return false; } static bool convert_clock(u64 *val, u32 numerator, u32 denominator) { u64 rem, res; if (!numerator || !denominator) return false; res = div64_u64_rem(*val, denominator, &rem) * numerator; *val = res + div_u64(rem * numerator, denominator); return true; } static bool convert_base_to_cs(struct system_counterval_t *scv) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; u32 num, den; /* The timestamp was taken from the time keeper clock source */ if (cs->id == scv->cs_id) return true; /* * Check whether cs_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently. */ base = READ_ONCE(cs->base); if (!base || base->id != scv->cs_id) return false; num = scv->use_nsecs ? cs->freq_khz : base->numerator; den = scv->use_nsecs ? USEC_PER_SEC : base->denominator; if (!convert_clock(&scv->cycles, num, den)) return false; scv->cycles += base->offset; return true; } static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id) { struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock; struct clocksource_base *base; /* * Check whether base_id matches the base clock. Prevent the compiler from * re-evaluating @base as the clocksource might change concurrently.
*/ base = READ_ONCE(cs->base); if (!base || base->id != base_id) return false; *cycles -= base->offset; if (!convert_clock(cycles, base->denominator, base->numerator)) return false; return true; } static bool convert_ns_to_cs(u64 *delta) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta)) return false; *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult); return true; } /** * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp * @treal: CLOCK_REALTIME timestamp to convert * @base_id: base clocksource id * @cycles: pointer to store the converted base clock timestamp * * Converts a supplied, future realtime clock value to the corresponding base clock value. * * Return: true if the conversion is successful, false otherwise. */ bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 delta; do { seq = read_seqcount_begin(&tk_core.seq); if ((u64)treal < tk->tkr_mono.base_real) return false; delta = (u64)treal - tk->tkr_mono.base_real; if (!convert_ns_to_cs(&delta)) return false; *cycles = tk->tkr_mono.cycle_last + delta; if (!convert_cs_to_base(cycles, base_id)) return false; } while (read_seqcount_retry(&tk_core.seq, seq)); return true; } EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned int seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently * installed timekeeper clocksource */ if (system_counterval.cs_id == CSID_GENERIC || !convert_base_to_cs(&system_counterval)) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. 
*/ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!timestamp_in_interval(interval_start, now, cycles)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value is not before the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !timestamp_in_interval(history_begin->cycles, cycles, system_counterval.cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * timekeeping_clocksource_has_base - Check whether the current clocksource * is based on a given base clock * @id: base clocksource ID * * Note: The return value is a snapshot which can become invalid right * after the function returns. * * Return: true if the timekeeper clocksource has a base clock with @id, * false otherwise */ bool timekeeping_clocksource_has_base(enum clocksource_ids id) { /* * This is a snapshot, so no point in using the sequence * count. Just prevent the compiler from re-evaluating @base as the * clocksource might change concurrently. */ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base); return base ? base->id == id : false; } EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time, updates NTP and notifies hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timespec64 ts_delta, xt; if (!timespec64_valid_settod(ts)) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); xt = tk_xtime(tks); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta)); tk_set_xtime(tks, ts); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); audit_tk_injoffset(ts_delta); add_device_randomness(ts, sizeof(*ts)); return 0; } EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time.
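* * For example, injecting ts = { .tv_sec = -1, .tv_nsec = 500000000 } moves * CLOCK_REALTIME back by half a second: the offset is added to xtime and * subtracted from wall_to_monotonic, so their sum, and thereby * CLOCK_MONOTONIC, is unaffected.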
*/ static int timekeeping_inject_offset(const struct timespec64 *ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 tmp; timekeeping_forward_now(tks); /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { timekeeping_restore_shadow(&tk_core); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL); return 0; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct clocksource *new = data, *old = NULL; /* * If the clocksource is in a module, get a module reference. * Succeeds for built-in code (owner == NULL) as well. Abort if the * reference can't be acquired. */ if (!try_module_get(new->owner)) return 0; /* Abort if the device can't be enabled */ if (new->enable && new->enable(new) != 0) { module_put(new->owner); return 0; } scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; timekeeping_forward_now(tks); old = tks->tkr_mono.clock; tk_setup_internals(tks, new); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } if (old) { if (old->disable) old->disable(old); module_put(old->owner); } return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 
0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely unmodified by NTP) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock64 - Return time from the persistent clock. * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ void __weak read_persistent_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * @wall_time: current time as returned by persistent clock * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } static __init void tkd_basic_setup(struct tk_data *tkd) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts off false and is only set when a suspend reaches * timekeeping_suspend(). timekeeping_resume() sets it to false when the * timekeeper clocksource does not stop across suspend and has been * used to update the sleep time. If the timekeeper clocksource has stopped * then the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected, and if so the flag is cleared again * afterwards. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection.
*/ static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; tkd_basic_setup(&tk_core); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want to set wall_to_mono, so the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); guard(raw_spinlock_irqsave)(&tk_core.lock); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tks, clock); tk_set_xtime(tks, &wall_time); tks->raw_sec = 0; tk_set_wall_to_mono(tks, wall_to_mono); timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @tk: Pointer to the timekeeper to be updated * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by the RTC subsystem. * If the system has neither 1) nor 2), 3) will be used as the fallback. * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes unnecessary, so in this case we don't need to call * rtc_resume(); this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /* * Whether 1) can be used is determined only in timekeeping_resume(), * which is invoked after rtc_suspend(), so we can't reliably skip * rtc_suspend() if the system has 1). * * But if the system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(); this is what * timekeeping_rtc_skipsuspend() means. */ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled,
* and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { scoped_guard(raw_spinlock_irqsave, &tk_core.lock) { struct timekeeper *tks = &tk_core.shadow_timekeeper; suspend_timing_needed = false; timekeeping_forward_now(tks); __timekeeping_inject_sleeptime(tks, delta); timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } /* Signal hrtimers about time change */ clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock = tks->tkr_mono.clock; struct timespec64 ts_new, ts_delta; bool inject_sleeptime = false; u64 cycle_now, nsec; unsigned long flags; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&tk_core.lock, flags); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tks->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tks, &ts_delta); } /* Re-base the last cycle value */ tks->tkr_mono.cycle_last = cycle_now; tks->tkr_raw.cycle_last = cycle_now; tks->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); raw_spin_unlock_irqrestore(&tk_core.lock, flags); touch_softlockup_watchdog(); /* Resume the clockevent device(s) and hrtimers */ tick_resume(); /* Notify timerfd as resume is equivalent to clock_was_set() */ timerfd_resume(); } int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; unsigned long flags; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&tk_core.lock, flags); timekeeping_forward_now(tks); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. 
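* * On resume the sleep length is then preferably derived from this nonstop * clocksource. With illustrative numbers for a 1GHz counter, and cyc2ns() * standing in for the clocksource's cycle to nanosecond conversion: * * sleep_ns = cyc2ns(cycle_now_resume - cycle_now_suspend) * = cyc2ns(3000000000 - 1000000000) = 2 seconds * * Only if that is not usable does the persistent clock delta * (ts_new - timekeeping_suspend_time) get injected instead.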
curr_clock = tks->tkr_mono.clock; cycle_now = tks->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update_from_shadow(&tk_core, 0); halt_fast_timekeeper(tks); raw_spin_unlock_irqrestore(&tk_core.lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; static int __init timekeeping_init_ops(void) { register_syscore_ops(&timekeeping_syscore_ops); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, let's assume mult_adj == 1 for now. * * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one; this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * It's the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment.
* now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplifies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u64 ntp_tl = ntp_tick_length(); u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tl)) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tl; mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, it's possible that the required corrective factor * to xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e.
xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if it's a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /* * logarithmic_accumulation - shifted accumulation of cycles * * This function accumulates a shifted interval of cycles into * a shifted interval of nanoseconds. Allows for an O(log) accumulation * loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple.
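* * Worked example: offset = 13 * cycle_interval starts with shift = 3 and is * consumed in chunks of 8, then 4, then 1 cycle_intervals, i.e. three * accumulation steps instead of thirteen single-interval ones.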
*/ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; } /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); /** * ktime_get_coarse_real_ts64_mg - return the later of coarse grained time or floor * @ts: timespec64 to be filled * * Fetch the global mg_floor value, convert it to realtime and compare it * to the current coarse-grained time. Fill @ts with whichever is * latest. Note that this is a filesystem-specific interface and should be * avoided outside of that context. */ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 floor = atomic64_read(&mg_floor); ktime_t f_real, offset, coarse; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); coarse = timespec64_to_ktime(*ts); f_real = ktime_add(floor, offset); if (ktime_after(f_real, coarse)) *ts = ktime_to_timespec64(f_real); } /** * ktime_get_real_ts64_mg - attempt to update floor value and return result * @ts: pointer to the timespec to be set * * Get a monotonic fine-grained time value and attempt to swap it into * mg_floor. If that succeeds then accept the new floor value. If it fails * then another task raced in during the interim time and updated the * floor. Since any update to the floor must be later than the previous * floor, either outcome is acceptable. * * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(), * and determining that the resulting coarse-grained timestamp did not effect * a change in ctime. Any more recent floor value would effect a change to * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure. * * @ts will be filled with the latest floor value, regardless of the outcome of * the cmpxchg. Note that this is a filesystem specific interface and should be * avoided outside of that context.
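* * The floor handling boils down to the following (editor's sketch): * * old = atomic64_read(&mg_floor); * mono = fine grained CLOCK_MONOTONIC now; * if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) * return mono + offs_real; (this task advanced the floor) * else * return old + offs_real; (someone else did, use their value) * * Either way the returned realtime value is never earlier than any * previously issued fine-grained timestamp.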
*/ void ktime_get_real_ts64_mg(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t old = atomic64_read(&mg_floor); ktime_t offset, mono; unsigned int seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; mono = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); mono = ktime_add_ns(mono, nsecs); /* * Attempt to update the floor with the new time value. As any * update must be later than the existing floor, and would effect * a change to ctime from the perspective of the current task, * accept the resulting floor value regardless of the outcome of * the swap. */ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) { ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); timekeeping_inc_mg_floor_swaps(); } else { /* * Another task changed mg_floor since "old" was fetched. * "old" has been updated with the latest value of "mg_floor". * That value is newer than the previous floor value, which * is enough to effect a change to ctime. Accept it. */ *ts = ktime_to_timespec64(ktime_add(old, offset)); } } void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different. * * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong!
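* * With the usual USER_HZ of 100 the nominal tick is 10000 usec, so the check * below accepts ADJ_TICK values between 900000/USER_HZ = 9000 and * 1100000/USER_HZ = 11000 usec, i.e. at most +-10% off nominal.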
		 */
		if (txc->modes & ADJ_TICK &&
		    (txc->tick < 900000/USER_HZ ||
		     txc->tick > 1100000/USER_HZ))
			return -EINVAL;
	}

	if (txc->modes & ADJ_SETOFFSET) {
		/* In order to inject time, you gotta be super-user! */
		if (!capable(CAP_SYS_TIME))
			return -EPERM;

		/*
		 * Validate if a timespec/timeval used to inject a time
		 * offset is valid. Offsets can be positive or negative, so
		 * we don't check tv_sec. The value of the timeval/timespec
		 * is the sum of its fields, but *NOTE*:
		 * The field tv_usec/tv_nsec must always be non-negative and
		 * we can't have more nanoseconds/microseconds than a second.
		 */
		if (txc->time.tv_usec < 0)
			return -EINVAL;

		if (txc->modes & ADJ_NANO) {
			if (txc->time.tv_usec >= NSEC_PER_SEC)
				return -EINVAL;
		} else {
			if (txc->time.tv_usec >= USEC_PER_SEC)
				return -EINVAL;
		}
	}

	/*
	 * Check for potential multiplication overflows that can
	 * only happen on 64-bit systems:
	 */
	if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
		if (LLONG_MIN / PPM_SCALE > txc->freq)
			return -EINVAL;
		if (LLONG_MAX / PPM_SCALE < txc->freq)
			return -EINVAL;
	}

	return 0;
}

/**
 * random_get_entropy_fallback - Returns the raw clock source value,
 * used by random.c for platforms with no valid random_get_entropy().
 */
unsigned long random_get_entropy_fallback(void)
{
	struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
	struct clocksource *clock = READ_ONCE(tkr->clock);

	if (unlikely(timekeeping_suspended || !clock))
		return 0;
	return clock->read(clock);
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);

/**
 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
 * @txc: Pointer to kernel_timex structure containing NTP parameters
 */
int do_adjtimex(struct __kernel_timex *txc)
{
	struct audit_ntp_data ad;
	bool offset_set = false;
	bool clock_set = false;
	struct timespec64 ts;
	int ret;

	/* Validate the data before disabling interrupts */
	ret = timekeeping_validate_timex(txc);
	if (ret)
		return ret;
	add_device_randomness(txc, sizeof(*txc));

	if (txc->modes & ADJ_SETOFFSET) {
		struct timespec64 delta;

		delta.tv_sec  = txc->time.tv_sec;
		delta.tv_nsec = txc->time.tv_usec;
		if (!(txc->modes & ADJ_NANO))
			delta.tv_nsec *= 1000;
		ret = timekeeping_inject_offset(&delta);
		if (ret)
			return ret;

		offset_set = delta.tv_sec != 0;
		audit_tk_injoffset(delta);
	}

	audit_ntp_init(&ad);

	ktime_get_real_ts64(&ts);
	add_device_randomness(&ts, sizeof(ts));

	scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
		struct timekeeper *tks = &tk_core.shadow_timekeeper;
		s32 orig_tai, tai;

		orig_tai = tai = tks->tai_offset;
		ret = __do_adjtimex(txc, &ts, &tai, &ad);

		if (tai != orig_tai) {
			__timekeeping_set_tai_offset(tks, tai);
			timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
			clock_set = true;
		} else {
			tk_update_leap_state_all(&tk_core);
		}
	}

	audit_ntp_log(&ad);

	/* Update the multiplier immediately if frequency was set directly */
	if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
		clock_set |= timekeeping_advance(TK_ADV_FREQ);

	if (clock_set)
		clock_was_set(CLOCK_SET_WALL);

	ntp_notify_cmos_timer(offset_set);

	return ret;
}

#ifdef CONFIG_NTP_PPS

/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts: Pointer to timespec64 structure representing phase timestamp
 * @raw_ts: Pointer to timespec64 structure representing raw timestamp
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
	guard(raw_spinlock_irqsave)(&tk_core.lock);
	__hardpps(phase_ts, raw_ts);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
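The ADJ_SETOFFSET path above is reachable from user space via adjtimex(2). As a minimal, hypothetical userspace sketch (not part of this file; assumes glibc's <sys/timex.h> exposes ADJ_SETOFFSET and ADJ_NANO), the following injects a +0.5 s step in nanosecond mode, staying within the bounds that timekeeping_validate_timex() enforces:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = {
		/* with ADJ_NANO, time.tv_usec carries nanoseconds */
		.modes = ADJ_SETOFFSET | ADJ_NANO,
		.time  = { .tv_sec = 0, .tv_usec = 500000000 },
	};

	/* the kernel rejects tv_usec < 0 or >= NSEC_PER_SEC with EINVAL,
	 * and callers without CAP_SYS_TIME with EPERM */
	if (adjtimex(&tx) == -1)
		perror("adjtimex");
	return 0;
}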
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_FLOW_DISSECTOR_H
#define _NET_FLOW_DISSECTOR_H

#include <linux/types.h>
#include <linux/in6.h>
#include <linux/siphash.h>
#include <linux/string.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/pkt_cls.h>

struct bpf_prog;
struct net;
struct sk_buff;

/**
 * struct flow_dissector_key_control:
 * @thoff: Transport header offset
 * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_*
 * @flags: Key flags.
 *	   Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*)
 */
struct flow_dissector_key_control {
	u16	thoff;
	u16	addr_type;
	u32	flags;
};

/* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those
 * flags are exposed to userspace in some error paths, ie. unsupported flags.
 */
enum flow_dissector_ctrl_flags {
	FLOW_DIS_IS_FRAGMENT		= TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT,
	FLOW_DIS_FIRST_FRAG		= TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
	FLOW_DIS_F_TUNNEL_CSUM		= TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM,
	FLOW_DIS_F_TUNNEL_DONT_FRAGMENT	= TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT,
	FLOW_DIS_F_TUNNEL_OAM		= TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM,
	FLOW_DIS_F_TUNNEL_CRIT_OPT	= TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT,

	/* These flags are internal to the kernel */
	FLOW_DIS_ENCAPSULATION		= (TCA_FLOWER_KEY_FLAGS_MAX << 1),
};

enum flow_dissect_ret {
	FLOW_DISSECT_RET_OUT_GOOD,
	FLOW_DISSECT_RET_OUT_BAD,
	FLOW_DISSECT_RET_PROTO_AGAIN,
	FLOW_DISSECT_RET_IPPROTO_AGAIN,
	FLOW_DISSECT_RET_CONTINUE,
};

/**
 * struct flow_dissector_key_basic:
 * @n_proto: Network header protocol (eg. IPv4/IPv6)
 * @ip_proto: Transport header protocol (eg. TCP/UDP)
 * @padding: Unused
 */
struct flow_dissector_key_basic {
	__be16	n_proto;
	u8	ip_proto;
	u8	padding;
};

struct flow_dissector_key_tags {
	u32	flow_label;
};

struct flow_dissector_key_vlan {
	union {
		struct {
			u16	vlan_id:12,
				vlan_dei:1,
				vlan_priority:3;
		};
		__be16	vlan_tci;
	};
	__be16	vlan_tpid;
	__be16	vlan_eth_type;
	u16	padding;
};

struct flow_dissector_mpls_lse {
	u32	mpls_ttl:8,
		mpls_bos:1,
		mpls_tc:3,
		mpls_label:20;
};

#define FLOW_DIS_MPLS_MAX 7
struct flow_dissector_key_mpls {
	struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */
	u8 used_lses; /* One bit set for each Label Stack Entry in use */
};

static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls,
					  int lse_index)
{
	mpls->used_lses |= 1 << lse_index;
}

#define FLOW_DIS_TUN_OPTS_MAX 255
/**
 * struct flow_dissector_key_enc_opts:
 * @data: tunnel option data
 * @len: length of tunnel option data
 * @dst_opt_type: tunnel option type
 */
struct flow_dissector_key_enc_opts {
	u8 data[FLOW_DIS_TUN_OPTS_MAX];	/* Using IP_TUNNEL_OPTS_MAX is desired
					 * here but seems difficult to #include
					 */
	u8 len;
	u32 dst_opt_type;
};

struct flow_dissector_key_keyid {
	__be32	keyid;
};

/**
 * struct flow_dissector_key_ipv4_addrs:
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv4_addrs {
	/* (src, dst) must be grouped, in the same way as in the IP header */
	__be32 src;
	__be32 dst;
};

/**
 * struct flow_dissector_key_ipv6_addrs:
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv6_addrs {
	/* (src, dst) must be grouped, in the same way as in the IP header */
	struct in6_addr src;
	struct in6_addr dst;
};

/**
 * struct flow_dissector_key_tipc:
 * @key: source node address combined with selector
 */
struct flow_dissector_key_tipc {
	__be32 key;
};

/**
 * struct flow_dissector_key_addrs:
 * @v4addrs: IPv4 addresses
 * @v6addrs: IPv6 addresses
 * @tipckey: TIPC key
 */
struct flow_dissector_key_addrs {
	union {
		struct flow_dissector_key_ipv4_addrs v4addrs;
		struct flow_dissector_key_ipv6_addrs v6addrs;
		struct flow_dissector_key_tipc tipckey;
	};
};

/**
 * struct flow_dissector_key_arp:
 * @sip: Sender IP address
 * @tip: Target IP address
 * @op: Operation
 * @sha: Sender hardware address
 * @tha: Target hardware address
 */
struct flow_dissector_key_arp {
	__u32 sip;
	__u32 tip;
	__u8 op;
	unsigned char sha[ETH_ALEN];
	unsigned char tha[ETH_ALEN];
};

/**
 * struct flow_dissector_key_ports:
 * @ports: port numbers of Transport header
 * @src: source port number
 * @dst: destination port number
 */
struct flow_dissector_key_ports {
	union {
		__be32 ports;
		struct {
			__be16 src;
			__be16 dst;
		};
	};
};

/**
 * struct flow_dissector_key_ports_range
 * @tp: port number from packet
 * @tp_min: min port number in range
 * @tp_max: max port number in range
 */
struct flow_dissector_key_ports_range {
	union {
		struct flow_dissector_key_ports tp;
		struct {
			struct flow_dissector_key_ports tp_min;
			struct flow_dissector_key_ports tp_max;
		};
	};
};

/**
 * struct flow_dissector_key_icmp:
 * @type: ICMP type
 * @code: ICMP code
 * @id: Session identifier
 */
struct flow_dissector_key_icmp {
	struct {
		u8 type;
		u8 code;
	};
	u16 id;
};

/**
 * struct flow_dissector_key_eth_addrs:
 * @src: source Ethernet address
 * @dst: destination Ethernet address
 */
struct flow_dissector_key_eth_addrs {
	/* (dst, src) must be grouped, in the same way as in the ETH header */
	unsigned char dst[ETH_ALEN];
	unsigned char src[ETH_ALEN];
};

/**
 * struct flow_dissector_key_tcp:
 * @flags: flags
 */
struct flow_dissector_key_tcp {
	__be16 flags;
};
/**
 * struct flow_dissector_key_ip:
 * @tos: tos
 * @ttl: ttl
 */
struct flow_dissector_key_ip {
	__u8	tos;
	__u8	ttl;
};

/**
 * struct flow_dissector_key_meta:
 * @ingress_ifindex: ingress ifindex
 * @ingress_iftype: ingress interface type
 * @l2_miss: packet did not match an L2 entry during forwarding
 */
struct flow_dissector_key_meta {
	int ingress_ifindex;
	u16 ingress_iftype;
	u8 l2_miss;
};

/**
 * struct flow_dissector_key_ct:
 * @ct_state: conntrack state after converting with map
 * @ct_mark: conntrack mark
 * @ct_zone: conntrack zone
 * @ct_labels: conntrack labels
 */
struct flow_dissector_key_ct {
	u16	ct_state;
	u16	ct_zone;
	u32	ct_mark;
	u32	ct_labels[4];
};

/**
 * struct flow_dissector_key_hash:
 * @hash: hash value
 */
struct flow_dissector_key_hash {
	u32 hash;
};

/**
 * struct flow_dissector_key_num_of_vlans:
 * @num_of_vlans: num_of_vlans value
 */
struct flow_dissector_key_num_of_vlans {
	u8 num_of_vlans;
};

/**
 * struct flow_dissector_key_pppoe:
 * @session_id: pppoe session id
 * @ppp_proto: ppp protocol
 * @type: pppoe eth type
 */
struct flow_dissector_key_pppoe {
	__be16 session_id;
	__be16 ppp_proto;
	__be16 type;
};

/**
 * struct flow_dissector_key_l2tpv3:
 * @session_id: identifier for a l2tp session
 */
struct flow_dissector_key_l2tpv3 {
	__be32 session_id;
};

/**
 * struct flow_dissector_key_ipsec:
 * @spi: identifier for an ipsec connection
 */
struct flow_dissector_key_ipsec {
	__be32 spi;
};

/**
 * struct flow_dissector_key_cfm
 * @mdl_ver: maintenance domain level (mdl) and cfm protocol version
 * @opcode: code specifying a type of cfm protocol packet
 *
 * See 802.1ag, ITU-T G.8013/Y.1731
 *         1               2
 * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0|
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * | mdl | version |     opcode    |
 * +-----+---------+-+-+-+-+-+-+-+-+
 */
struct flow_dissector_key_cfm {
	u8	mdl_ver;
	u8	opcode;
};

#define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5)
#define FLOW_DIS_CFM_MDL_MAX 7

enum flow_dissector_key_id {
	FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
	FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
	FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */
	FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
	FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
	FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
	FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */
	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */
	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
	FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
	FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
	FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
	FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
	FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
	FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
	FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
	FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
	FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
	FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
	FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
	FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
	FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
	FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
	FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */
	FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */
	FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */
	FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */
	FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */
	FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */

	FLOW_DISSECTOR_KEY_MAX,
};

#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG		BIT(0)
#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	BIT(1)
#define FLOW_DISSECTOR_F_STOP_AT_ENCAP		BIT(2)
#define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP	BIT(3)

struct flow_dissector_key {
	enum flow_dissector_key_id key_id;
	size_t offset; /* offset of struct flow_dissector_key_* in the target struct */
};

struct flow_dissector {
	unsigned long long used_keys; /* each bit represents presence of one key id */
	unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
};

struct flow_keys_basic {
	struct flow_dissector_key_control control;
	struct flow_dissector_key_basic basic;
};

struct flow_keys {
	struct flow_dissector_key_control control;
#define FLOW_KEYS_HASH_START_FIELD basic
	struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT);
	struct flow_dissector_key_tags tags;
	struct flow_dissector_key_vlan vlan;
	struct flow_dissector_key_vlan cvlan;
	struct flow_dissector_key_keyid keyid;
	struct flow_dissector_key_ports ports;
	struct flow_dissector_key_icmp icmp;
	/* 'addrs' must be the last member */
	struct flow_dissector_key_addrs addrs;
};

#define FLOW_KEYS_HASH_OFFSET		\
	offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)

__be32 flow_get_u32_src(const struct flow_keys *flow);
__be32 flow_get_u32_dst(const struct flow_keys *flow);

extern struct flow_dissector flow_keys_dissector;
extern struct flow_dissector flow_keys_basic_dissector;

/* struct flow_keys_digest:
 *
 * This structure is used to hold a digest of the full flow keys. This is a
 * larger "hash" of a flow to allow definitively matching specific flows where
 * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
 * that it can be used in CB of skb (see sch_choke for an example).
*/ #define FLOW_KEYS_DIGEST_LEN 16 struct flow_keys_digest { u8 data[FLOW_KEYS_DIGEST_LEN]; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } u32 flow_hash_from_keys(struct flow_keys *keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1ULL << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id, void *target_container) { return ((char *)target_container) + flow_dissector->offset[key_id]; } struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; const void *data; const void *data_end; }; static inline void flow_dissector_init_keys(struct flow_dissector_key_control *key_control, struct flow_dissector_key_basic *key_basic) { memset(key_control, 0, sizeof(*key_control)); memset(key_basic, 0, sizeof(*key_basic)); } #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif
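Taken together, a dissector walks a packet and scatters each recognized key into the caller's container at the registered offsets. As a hedged kernel-side sketch (example_skb_flow_hash() is hypothetical, and skb_flow_dissect_flow_keys() is declared in <linux/skbuff.h> rather than in this header), computing a flow hash for an skb looks roughly like:

static u32 example_skb_flow_hash(struct sk_buff *skb)
{
	struct flow_keys keys;

	memset(&keys, 0, sizeof(keys));
	if (!skb_flow_dissect_flow_keys(skb, &keys, 0))
		return 0;	/* dissection failed, e.g. truncated headers */

	/* mixes addrs, ports and basic protocol fields through siphash */
	return flow_hash_from_keys(&keys);
}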
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2015, 2016 ARM Ltd.
 */
#ifndef __KVM_ARM_VGIC_NEW_H__
#define __KVM_ARM_VGIC_NEW_H__

#include <linux/irqchip/arm-gic-common.h>
#include <asm/kvm_mmu.h>

#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */
#define IMPLEMENTER_ARM		0x43b

#define VGIC_ADDR_UNDEF		(-1)
#define IS_VGIC_ADDR_UNDEF(_x)	((_x) == VGIC_ADDR_UNDEF)

#define INTERRUPT_ID_BITS_SPIS	10
#define INTERRUPT_ID_BITS_ITS	16
#define VGIC_LPI_MAX_INTID	((1 << INTERRUPT_ID_BITS_ITS) - 1)
#define VGIC_PRI_BITS		5

#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)

#define VGIC_AFFINITY_0_SHIFT 0
#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
#define VGIC_AFFINITY_1_SHIFT 8
#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
#define VGIC_AFFINITY_2_SHIFT 16
#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
#define VGIC_AFFINITY_3_SHIFT 24
#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)

#define VGIC_AFFINITY_LEVEL(reg, level) \
	((((reg) & VGIC_AFFINITY_## level ##_MASK) \
	>> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))

/*
 * Userspace encodes the affinity differently from the MPIDR; the macro
 * below converts the vgic userspace format to the MPIDR register format.
 */
#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
			    VGIC_AFFINITY_LEVEL(val, 1) | \
			    VGIC_AFFINITY_LEVEL(val, 2) | \
			    VGIC_AFFINITY_LEVEL(val, 3))

/*
 * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst,
 * the macros below are defined for CPUREG encoding.
*/ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 #define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14 #define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800 #define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11 #define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780 #define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7 #define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078 #define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3 #define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007 #define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0 #define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \ KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \ KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \ KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* * As per Documentation/virt/kvm/devices/arm-vgic-its.rst, * below macros are defined for ITS table entry encoding. */ #define KVM_ITS_CTE_VALID_SHIFT 63 #define KVM_ITS_CTE_VALID_MASK BIT_ULL(63) #define KVM_ITS_CTE_RDBASE_SHIFT 16 #define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0) #define KVM_ITS_ITE_NEXT_SHIFT 48 #define KVM_ITS_ITE_PINTID_SHIFT 16 #define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16) #define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0) #define KVM_ITS_DTE_VALID_SHIFT 63 #define KVM_ITS_DTE_VALID_MASK BIT_ULL(63) #define KVM_ITS_DTE_NEXT_SHIFT 49 #define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49) #define KVM_ITS_DTE_ITTADDR_SHIFT 5 #define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5) #define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0) #define KVM_ITS_L1E_VALID_MASK BIT_ULL(63) /* we only support 64 kB translation table page size */ #define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16) #define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0) #define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12) #define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12 #define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16) #define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52) #define KVM_VGIC_V3_RDIST_COUNT_SHIFT 52 #ifdef CONFIG_DEBUG_SPINLOCK #define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p) #else #define DEBUG_SPINLOCK_BUG_ON(p) #endif static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.vgic.implementation_rev; } /* Requires the irq_lock to be held by the caller. */ static inline bool irq_is_pending(struct vgic_irq *irq) { if (irq->config == VGIC_CONFIG_EDGE) return irq->pending_latch; else return irq->pending_latch || irq->line_level; } static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) { return irq->config == VGIC_CONFIG_LEVEL && irq->hw; } static inline int vgic_irq_get_lr_count(struct vgic_irq *irq) { /* Account for the active state as an interrupt */ if (vgic_irq_is_sgi(irq->intid) && irq->source) return hweight8(irq->source) + irq->active; return irq_is_pending(irq) || irq->active; } static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) { return vgic_irq_get_lr_count(irq) > 1; } static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) { struct vgic_dist *dist = &kvm->arch.vgic; int ret; dist->table_write_in_progress = true; ret = kvm_write_guest_lock(kvm, gpa, data, len); dist->table_write_in_progress = false; return ret; } /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC * state to userspace can generate either GICv2 or GICv3 CPU interface * registers regardless of the hardware backed GIC used. 
*/ struct vgic_vmcr { u32 grpen0; u32 grpen1; u32 ackctl; u32 fiqen; u32 cbpr; u32 eoim; u32 abpr; u32 bpr; u32 pmr; /* Priority mask field in the GICC_PMR and * ICC_PMR_EL1 priority field format */ }; struct vgic_reg_attr { struct kvm_vcpu *vcpu; gpa_t addr; }; struct its_device { struct list_head dev_list; /* the head for the list of ITTEs */ struct list_head itt_head; u32 num_eventid_bits; gpa_t itt_addr; u32 device_id; }; #define COLLECTION_NOT_MAPPED ((u32)~0) struct its_collection { struct list_head coll_list; u32 collection_id; u32 target_addr; }; #define its_is_collection_mapped(coll) ((coll) && \ ((coll)->target_addr != COLLECTION_NOT_MAPPED)) struct its_ite { struct list_head ite_list; struct vgic_irq *irq; struct its_collection *collection; u32 event_id; }; int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr); const struct vgic_register_region * vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, gpa_t addr, int len); struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid); struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_get_phys_line_level(struct vgic_irq *irq); void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags) __releases(&irq->irq_lock); void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, phys_addr_t addr, phys_addr_t alignment, phys_addr_t size); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v2_enable(struct kvm_vcpu *vcpu); int vgic_v2_probe(const struct gic_kvm_info *info); int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); void vgic_v2_init_lrs(void); void vgic_v2_load(struct kvm_vcpu *vcpu); void vgic_v2_put(struct kvm_vcpu *vcpu); void vgic_v2_save_state(struct kvm_vcpu *vcpu); void vgic_v2_restore_state(struct kvm_vcpu *vcpu); static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq) { if (!irq) return false; if (irq->intid < VGIC_MIN_LPI) return true; return kref_get_unless_zero(&irq->refcount); } static inline void vgic_get_irq_kref(struct vgic_irq *irq) { WARN_ON_ONCE(!vgic_try_get_irq_kref(irq)); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void 
vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); int vgic_v3_save_pending_tables(struct kvm *kvm); int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count); int vgic_register_redist_iodev(struct kvm_vcpu *vcpu); void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu); bool vgic_v3_check_base(struct kvm *kvm); void vgic_v3_load(struct kvm_vcpu *vcpu); void vgic_v3_put(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr, bool is_write); int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, u32 intid, u32 *val); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); void vgic_debug_init(struct kvm *kvm); void vgic_debug_destroy(struct kvm *kvm); static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) { struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu; /* * num_pri_bits are initialized with HW supported values. * We can rely safely on num_pri_bits even if VM has not * restored ICC_CTLR_EL1 before restoring APnR registers. 
*/ switch (cpu_if->num_pri_bits) { case 7: return 3; case 6: return 1; default: return 0; } } static inline bool vgic_v3_redist_region_full(struct vgic_redist_region *region) { if (!region->count) return false; return (region->free_index >= region->count); } struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs); static inline size_t vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg) { if (!rdreg->count) return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE; else return rdreg->count * KVM_VGIC_V3_REDIST_SIZE; } struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index); void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg); bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size) { struct vgic_dist *d = &kvm->arch.vgic; return (base + size > d->vgic_dist_base) && (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE); } bool vgic_lpis_enabled(struct kvm_vcpu *vcpu); int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, u32 devid, u32 eventid, struct vgic_irq **irq); struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi); void vgic_its_invalidate_all_caches(struct kvm *kvm); /* GICv4.1 MMIO interface */ int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq); int vgic_its_invall(struct kvm_vcpu *vcpu); bool vgic_supports_direct_msis(struct kvm *kvm); int vgic_v4_init(struct kvm *kvm); void vgic_v4_teardown(struct kvm *kvm); void vgic_v4_configure_vsgis(struct kvm *kvm); void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq); void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu); static inline bool kvm_has_gicv3(struct kvm *kvm) { return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP); } void vgic_v3_sync_nested(struct kvm_vcpu *vcpu); void vgic_v3_load_nested(struct kvm_vcpu *vcpu); void vgic_v3_put_nested(struct kvm_vcpu *vcpu); void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu); void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu); int vgic_its_debug_init(struct kvm_device *dev); void vgic_its_debug_destroy(struct kvm_device *dev); #endif
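The KVM_ITS_*_MASK definitions earlier in this header describe the packed layout of the saved ITS table entries. A purely illustrative sketch (example_decode_its_dte() is hypothetical, not part of this header) of pulling the fields out of a device table entry with FIELD_GET() from <linux/bitfield.h>:

#include <linux/bitfield.h>

static void example_decode_its_dte(u64 dte)
{
	/* bit 63: entry valid */
	bool valid = FIELD_GET(KVM_ITS_DTE_VALID_MASK, dte);
	/* bits 62:49: next pointer; bits 48:5: ITT address field */
	u64 next = FIELD_GET(KVM_ITS_DTE_NEXT_MASK, dte);
	u64 ittaddr = FIELD_GET(KVM_ITS_DTE_ITTADDR_MASK, dte);
	/* bits 4:0: ITT size field (encoding per arm-vgic-its.rst) */
	u8 size = FIELD_GET(KVM_ITS_DTE_SIZE_MASK, dte);

	pr_info("DTE: valid=%d next=%llu ittaddr=%#llx size=%u\n",
		valid, next, ittaddr, size);
}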
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_ABOVE4G
#define MAP_ABOVE4G 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the
 * kernel.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
		| MAP_PRIVATE \
		| MAP_FIXED \
		| MAP_ANONYMOUS \
		| MAP_DENYWRITE \
		| MAP_EXECUTABLE \
		| MAP_UNINITIALIZED \
		| MAP_GROWSDOWN \
		| MAP_LOCKED \
		| MAP_NORESERVE \
		| MAP_POPULATE \
		| MAP_NONBLOCK \
		| MAP_STACK \
		| MAP_HUGETLB \
		| MAP_32BIT \
		| MAP_ABOVE4G \
		| MAP_HUGE_2MB \
		| MAP_HUGE_1GB)

extern int sysctl_overcommit_memory;

extern struct percpu_counter vm_committed_as;

#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

unsigned long vm_memory_committed(void);

static inline void vm_acct_memory(long pages)
{
	percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
}

static inline void vm_unacct_memory(long pages)
{
	vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(file, flags) 0
#endif

#ifndef arch_validate_prot
/*
 * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have
 * already been masked out.
 *
 * Returns true if the prot flags are valid
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
	return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0;
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * Returns true if the VM_* flags are valid.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
	return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Optimisation macro.  It is equivalent to:
 *	(x & bit1) ?
bit2 : 0 * but this version is faster. * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. */ static inline unsigned long calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | arch_calc_vm_prot_bits(prot, pkey); } /* * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline unsigned long calc_vm_flag_bits(struct file *file, unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | arch_calc_vm_flag_bits(file, flags); } unsigned long vm_commit_limit(void); #ifndef arch_memory_deny_write_exec_supported static inline bool arch_memory_deny_write_exec_supported(void) { return true; } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif /* * Denies creating a writable executable mapping or gaining executable permissions. * * This denies the following: * * a) mmap(PROT_WRITE | PROT_EXEC) * * b) mmap(PROT_WRITE) * mprotect(PROT_EXEC) * * c) mmap(PROT_WRITE) * mprotect(PROT_READ) * mprotect(PROT_EXEC) * * But allows the following: * * d) mmap(PROT_READ | PROT_EXEC) * mmap(PROT_READ | PROT_EXEC | PROT_BTI) * * This is only applicable if the user has set the Memory-Deny-Write-Execute * (MDWE) protection mask for the current process. * * @old specifies the VMA flags the VMA originally possessed, and @new the ones * we propose to set. * * Return: false if proposed change is OK, true if not ok and should be denied. */ static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) return false; /* If the new VMA is not executable, we have nothing to deny. */ if (!(new & VM_EXEC)) return false; /* Under MDWE we do not accept newly writably executable VMAs... */ if (new & VM_WRITE) return true; /* ...nor previously non-executable VMAs becoming executable. */ if (!(old & VM_EXEC)) return true; return false; } #endif /* _LINUX_MMAN_H */
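Because bit1 and bit2 are single bits, _calc_vm_trans() can translate one flag into another with a constant multiply or divide instead of a branch. A standalone, userspace-compilable sketch of the same trick (the flag values below are made up for illustration, not real MAP_*/VM_* bits):

#include <stdio.h>

/* same shape as the kernel's _calc_vm_trans() */
#define calc_trans(x, bit1, bit2) \
	((!(bit1) || !(bit2)) ? 0 : \
	 ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
			   : ((x) & (bit1)) / ((bit1) / (bit2))))

int main(void)
{
	unsigned long FLAG_IN  = 0x10;   /* hypothetical source bit */
	unsigned long FLAG_OUT = 0x1000; /* hypothetical destination bit */

	/* bit set: (0x30 & 0x10) * (0x1000 / 0x10) == 0x10 * 0x100 == 0x1000 */
	printf("%#lx\n", calc_trans(0x30UL, FLAG_IN, FLAG_OUT));
	/* bit clear: the mask yields 0, so the multiply yields 0 */
	printf("%#lx\n", calc_trans(0x20UL, FLAG_IN, FLAG_OUT));
	return 0;
}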
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  TUN - Universal TUN/TAP device driver.
* Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> #include "tun_vnet.h" static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). 
* * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if 
(tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = from_timer(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? 
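 * For now the existing entry is simply repointed at the current queue and its timestamp refreshed.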
*/ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { txq = reciprocal_scale(txq, numqueues); } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { 
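/* Full teardown: stop NAPI for a still-attached queue and unregister it. */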
if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; 
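/* The new queue takes the next free slot; any pending shutdown is cleared and the XDP rxq info is registered or updated below. */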
tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. 
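 * (the MAC header offset is not valid here, so the Ethernet header is read straight from skb->data)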
*/ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); enum skb_drop_reason drop_reason; int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. 
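 * check_filter() returns non-zero to accept the packet and zero to drop it.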
*/ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter && sk_filter(tfile->socket.sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; } len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = 
tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { timer_delete_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. 
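 * Advertise basic, redirect and ndo_xmit XDP support for the TAP case.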
*/ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. 
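 * Keep the whole payload linear when it fits in one page; otherwise cap the linear part so the rest fits in MAX_SKB_FRAGS frags.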
*/ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad, int metasize) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); if (metasize) skb_metadata_set(skb, metasize); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int metasize = 0; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += 
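/* extra headroom required by XDP programs so they can push headers */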
XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; /* It is known that the xdp_buff was prepared with metadata * support, so the metasize will never be negative. */ metasize = xdp.data - xdp.data_meta; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; int hdr_len = 0; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); hdr_len = tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, from, &gso); if (hdr_len < 0) return hdr_len; len -= vnet_hdr_sz; } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = min(hdr_len ? 
hdr_len : GOODCOPY_LEN, good_linear); linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; linear = min(hdr_len, good_linear); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (tun_vnet_hdr_to_skb(tun->flags, skb, &gso)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. 
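 * Push the Ethernet header back into the linear data and use eth_get_headlen() to sanity-check it before the frags are handed to GRO.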
*/ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; ssize_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); if (ret) return ret; } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; int ret; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr gso; ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); if (ret) return ret; ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); if (ret) return ret; } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = 
offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = 
TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; u32 rxhash = 0, act; int buflen = hdr->buflen; int metasize = 0; int ret = 0; bool skb_xdp = false; struct page *page; if (unlikely(datasize < ETH_HLEN)) return -EINVAL; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { 
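/* XDP verdict handling failed: release the backing page and propagate the error. */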
put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); /* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. This will lead to a * metasize of -1 and is the reason why the condition checks for > 0. */ metasize = xdp->data - xdp->data_meta; if (metasize > 0) skb_metadata_set(skb, metasize); if (tun_vnet_hdr_to_skb(tun->flags, skb, gso)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? 
ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? 
sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. 
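 * A running device may have had its tx queues stopped earlier, so wake them all here.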
*/ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strcpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strcpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. 
*/ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int ifindex; int sndbuf; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = -EINVAL; if (ifindex < 0) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? 
"enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct 
file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if (on) { ret = file_f_owner_allocate(file); if (ret) goto out; } if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int 
tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static void tun_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct tun_struct *tun = netdev_priv(dev); channels->combined_count = tun->numqueues; channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { 
int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun");
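The TUNSETIFF path handled in __tun_chr_ioctl()/tun_set_iff() above is what userspace exercises when it creates or attaches to a tun/tap interface. As a hedged illustration only (this is not part of the driver, and error handling is trimmed), a minimal userspace helper that opens the character device and requests an IFF_TUN interface could look like the sketch below; an empty name lets the kernel pick "tun%d", exactly as tun_set_iff() does.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Sketch: open /dev/net/tun and create/attach a TUN interface.
 * 'name' must point to a buffer of at least IFNAMSIZ bytes. */
int tun_alloc(char *name)
{
	struct ifreq ifr;
	int fd = open("/dev/net/tun", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	/* IFF_TUN: L3 device, no Ethernet header; IFF_NO_PI: no packet-info prefix */
	ifr.ifr_flags = IFF_TUN | IFF_NO_PI;
	if (*name)
		strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

	/* This lands in __tun_chr_ioctl() -> tun_set_iff() in the driver above */
	if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
		close(fd);
		return -1;
	}

	strcpy(name, ifr.ifr_name);	/* kernel reports the allocated name back */
	return fd;			/* read()/write() on fd now carry raw IP packets */
}

After TUNSETIFF succeeds, the returned descriptor behaves like the packet-socket-style interface implemented by tun_socket_ops above: reads dequeue frames destined for the interface and writes inject frames into the network stack.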
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_MTE_H #define __ASM_MTE_H #include <asm/compiler.h> #include <asm/mte-def.h> #ifndef __ASSEMBLY__ #include <linux/bitfield.h> #include <linux/kasan-enabled.h> #include <linux/page-flags.h> #include <linux/sched.h> #include <linux/types.h> #include <asm/pgtable-types.h> void mte_clear_page_tags(void *addr); unsigned long mte_copy_tags_from_user(void *to, const void __user *from, unsigned long n); unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); int mte_save_tags(struct page *page); void mte_save_page_tags(const void *page_addr, void *tag_storage); void mte_restore_tags(swp_entry_t entry, struct page *page); void mte_restore_page_tags(void *page_addr, const void *tag_storage); void mte_invalidate_tags(int type, pgoff_t offset); void mte_invalidate_tags_area(int type); void *mte_allocate_tag_storage(void); void mte_free_tag_storage(char *storage); #ifdef CONFIG_ARM64_MTE /* track which pages have valid allocation tags */ #define PG_mte_tagged PG_arch_2 /* simple lock to avoid multiple threads tagging the same page */ #define PG_mte_lock PG_arch_3 static inline void set_page_mte_tagged(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * Ensure that the tags written prior to this function are visible * before the page flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &page->flags); } static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } /* * Lock the page for tagging and return 'true' if the page can be tagged, * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the * locking only happens once for page initialisation. * * The page MTE lock state: * * Locked: PG_mte_lock && !PG_mte_tagged * Unlocked: !PG_mte_lock || PG_mte_tagged * * Acquire semantics only if the page is tagged (returning 'false'). */ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. 
*/ smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); return false; } void mte_zero_clear_page_tags(void *addr); void mte_sync_tags(pte_t pte, unsigned int nr_pages); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); void mte_cpu_setup(void); void mte_suspend_enter(void); void mte_suspend_exit(void); long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data); size_t mte_probe_user_range(const char __user *uaddr, size_t size); #else /* CONFIG_ARM64_MTE */ /* unused if !CONFIG_ARM64_MTE, silence the compiler */ #define PG_mte_tagged 0 static inline void set_page_mte_tagged(struct page *page) { } static inline bool page_mte_tagged(struct page *page) { return false; } static inline bool try_page_mte_tagging(struct page *page) { return false; } static inline void mte_zero_clear_page_tags(void *addr) { } static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) { } static inline void mte_thread_init_user(void) { } static inline void mte_thread_switch(struct task_struct *next) { } static inline void mte_suspend_enter(void) { } static inline void mte_suspend_exit(void) { } static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg) { return 0; } static inline long get_mte_ctrl(struct task_struct *task) { return 0; } static inline int mte_ptrace_copy_tags(struct task_struct *child, long request, unsigned long addr, unsigned long data) { return -EIO; } #endif /* CONFIG_ARM64_MTE */ #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * Ensure that the tags written prior to this function are visible * before the folio flags update. */ smp_wmb(); set_bit(PG_mte_tagged, &folio->flags); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { bool ret = test_bit(PG_mte_tagged, &folio->flags); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); /* * If the folio is tagged, ensure ordering with a likely subsequent * read of the tags. */ if (ret) smp_rmb(); return ret; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); if (!test_and_set_bit(PG_mte_lock, &folio->flags)) return true; /* * The tags are either being initialised or may have been initialised * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); return false; } #else static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) { } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { return false; } static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { return false; } #endif static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) return; /* * Re-enable tag checking (TCO set on exception entry). This is only * necessary if MTE is enabled in either the kernel or the userspace * task in synchronous or asymmetric mode (SCTLR_EL1.TCF0 bit 0 is set * for both). 
With MTE disabled in the kernel and disabled or * asynchronous in userspace, tag check faults (including in uaccesses) * are not reported, therefore there is no need to re-enable checking. * This is beneficial on microarchitectures where re-enabling TCO is * expensive. */ if (kasan_hw_tags_enabled() || (task->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT))) asm volatile(SET_PSTATE_TCO(0)); } #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { if (!kasan_hw_tags_enabled()) return; mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { if (!kasan_hw_tags_enabled()) return; /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() * is required. */ dsb(nsh); isb(); mte_check_tfsr_el1(); } #else static inline void mte_check_tfsr_el1(void) { } static inline void mte_check_tfsr_entry(void) { } static inline void mte_check_tfsr_exit(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* __ASSEMBLY__ */ #endif /* __ASM_MTE_H */
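The PG_mte_lock/PG_mte_tagged protocol documented above is easiest to see from the caller's side: whoever wins test_and_set_bit(PG_mte_lock) owns tag initialisation, and everyone else either observes PG_mte_tagged already set or waits in smp_cond_load_acquire() until it is published. The sketch below illustrates that sequence; the function name and the particular tag-initialisation step are assumptions for illustration (real call sites in the kernel differ), and it assumes normal kernel context with linux/mm.h available.

/*
 * Illustrative sketch only, not a real call site.  If try_page_mte_tagging()
 * returns true, this thread owns the (one-time) tag initialisation; any
 * concurrent caller spins in smp_cond_load_acquire() until PG_mte_tagged
 * is published by set_page_mte_tagged().
 */
static void init_page_tags_sketch(struct page *page)
{
	if (try_page_mte_tagging(page)) {
		/* whatever tag initialisation the caller needs, e.g. zeroing: */
		mte_clear_page_tags(page_address(page));
		/* publish: the smp_wmb() inside orders the tag writes first */
		set_page_mte_tagged(page);
	}
	/* from here on, page_mte_tagged(page) is true for all observers */
}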
// SPDX-License-Identifier: GPL-2.0-only /* * mm/readahead.c - address_space-level file readahead. * * Copyright (C) 2002, Linus Torvalds * * 09Apr2002 Andrew Morton * Initial version. */ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. 
Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. 
->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* Clean up the remaining folios. 
*/ while ((folio = readahead_folio(rac)) != NULL) { folio_get(folio); filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } static struct folio *ractl_alloc_folio(struct readahead_control *ractl, gfp_t gfp_mask, unsigned int order) { struct folio *folio; folio = filemap_alloc_folio(gfp_mask, order); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); return folio; } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); /* * As iterator `i` is aligned to min_nrpages, round_up the * difference between nr_to_read and lookahead_size to mark the * index that only has lookahead or "async_region" to set the * readahead flag. */ if (lookahead_size <= nr_to_read) { unsigned long ra_folio_index; ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, min_nrpages); mark = ra_folio_index - index; } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; /* * Preallocate as many pages as we will need. */ while (i < nr_to_read) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. 
*/ read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } folio = ractl_alloc_folio(ractl, gfp_mask, mapping_min_folio_order(mapping)); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } if (i == mark) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages += min_nrpages; i += min_nrpages; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; do_page_cache_ra(ractl, this_chunk, 0); nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. 
* * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping)); /* * Fallback when size < min_nrpages as each folio should be * at least min_nrpages anyway. */ if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size) goto fallback; limit = min(limit, index + ra->size - 1); if (new_order < mapping_max_folio_order(mapping)) new_order += 2; new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * If the new_order is greater than min_order and index is * already aligned to new_order, then this will be noop as index * aligned to new_order should also be aligned to min_order. 
*/ ractl->_index = mapping_align_index(mapping, index); index = readahead_index(ractl); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (order > min_order && index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation below. */ if (!err) return; fallback: /* * ->readahead() may have updated readahead window size so we have to * check there's still something to read. */ if (ra->size > index - start) do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ractl->ra->ra_pages; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); return max_pages; } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { pgoff_t index = readahead_index(ractl); bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); struct file_ra_state *ra = ractl->ra; unsigned long max_pages, contig_count; pgoff_t prev_index, miss; /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } max_pages = ractl_max_pages(ractl, req_count); prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; /* * A start of file, oversized read, or sequential cache miss: * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ if (!index || req_count > max_pages || index - prev_index <= 1UL) { ra->start = index; ra->size = get_init_ra_size(req_count, max_pages); ra->async_size = ra->size > req_count ? ra->size - req_count : ra->size >> 1; goto readit; } /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ rcu_read_lock(); miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages); rcu_read_unlock(); contig_count = index - miss - 1; /* * Standalone, small random read. Read as is, and do not pollute the * readahead state. 
*/ if (contig_count <= req_count) { do_page_cache_ra(ractl, req_count, 0); return; } /* * File cached from the beginning: * it is a strong indication of long-run stream (or whole-file-read) */ if (miss == ULONG_MAX) contig_count *= 2; ra->start = index; ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: ractl->_index = ra->start; page_cache_ra_order(ractl, ra, 0); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); pgoff_t expected, start; unsigned int order = folio_order(folio); /* no readahead */ if (!ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; folio_clear_readahead(folio); if (blk_cgroup_congested()) return; max_pages = ractl_max_pages(ractl, req_count); /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected) { ra->start += ra->size; /* * In the case of MADV_HUGEPAGE, the actual size might exceed * the readahead window. */ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; readit: ractl->_index = ra->start; page_cache_ra_order(ractl, ra, order); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { CLASS(fd, f)(fd); if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || (!S_ISREG(file_inode(fd_file(f))->i_mode) && !S_ISBLK(file_inode(fd_file(f))->i_mode))) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. 
* * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); unsigned int min_order = mapping_min_folio_order(mapping); new_index = new_start / PAGE_SIZE; /* * Readahead code should have aligned the ractl->_index to * min_nrpages before calling readahead aops. */ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; if (ra) { ra->size += min_nrpages; ra->async_size += min_nrpages; } } } EXPORT_SYMBOL(readahead_expand);
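The DOC comment at the top of this file states that ->readahead() implementations generally call readahead_folio() in a loop and must unlock any folio whose read attempt fails, so the VFS can fall back to ->read_folio(). A hedged sketch of that consumption pattern is shown below; "myfs" and myfs_submit_read() are hypothetical, and folios the loop never takes via readahead_folio() are cleaned up afterwards by read_pages() above.

/*
 * Illustrative sketch of a filesystem ->readahead() along the lines the
 * DOC comment describes.  myfs_submit_read() stands in for whatever
 * mechanism the filesystem uses to start asynchronous I/O on one folio.
 */
static void myfs_readahead(struct readahead_control *rac)
{
	struct folio *folio;

	while ((folio = readahead_folio(rac)) != NULL) {
		if (myfs_submit_read(folio) < 0) {
			/* read attempt failed: unlock so ->read_folio() can retry it */
			folio_unlock(folio);
		}
		/* on success, the I/O completion path unlocks the folio */
	}
}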
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  i.e.: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * 4MB minimal write chunk size
 */
#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
static unsigned int dirtytime_expire_interval = 12 * 60 * 60;

static inline struct inode *wb_inode(struct list_head *head)
{
	return list_entry(head, struct inode, i_io_list);
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb)) {
		return false;
	} else {
		set_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(!wb->avg_write_bandwidth);
		atomic_long_add(wb->avg_write_bandwidth,
				&wb->bdi->tot_write_bandwidth);
		return true;
	}
}

static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
	if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
	    list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
		clear_bit(WB_has_dirty_io, &wb->state);
		WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
					&wb->bdi->tot_write_bandwidth) < 0);
	}
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and set %WB_has_dirty_io.
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
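 *
 * A minimal usage sketch (illustrative only; the locking calls are real
 * APIs, the surrounding flow is assumed rather than taken from a caller):
 *
 *	spin_lock(&wb->list_lock);
 *	spin_lock(&inode->i_lock);
 *	if (inode_io_list_move_locked(inode, wb, &wb->b_dirty))
 *		;	/* first dirty inode: WB_has_dirty_io was just set */
 *	spin_unlock(&inode->i_lock);
 *	spin_unlock(&wb->list_lock);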
 */
static bool inode_io_list_move_locked(struct inode *inode,
				      struct bdi_writeback *wb,
				      struct list_head *head)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	list_move(&inode->i_io_list, head);

	/* dirty_time doesn't count as dirty_io until expiration */
	if (head != &wb->b_dirty_time)
		return wb_io_lists_populated(wb);

	wb_io_lists_depopulated(wb);
	return false;
}

static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_irq(&wb->work_lock);
}

/*
 * This function is used when the first inode for this wb is marked dirty. It
 * wakes up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * start only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save a few context
 * switches by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
static void wb_wakeup_delayed(struct bdi_writeback *wb)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_irq(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
	spin_unlock_irq(&wb->work_lock);
}

static void finish_writeback_work(struct wb_writeback_work *work)
{
	struct wb_completion *done = work->done;

	if (work->auto_free)
		kfree(work);
	if (done) {
		wait_queue_head_t *waitq = done->waitq;

		/* @done can't be accessed after the following dec */
		if (atomic_dec_and_test(&done->cnt))
			wake_up_all(waitq);
	}
}

static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_irq(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(work);

	spin_unlock_irq(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items issued to @bdi with their ->done field
 * set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  This function returns after all such work items
 * are completed.  Work items which are waited upon aren't freed
 * automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
	atomic_dec(&done->cnt);		/* put down the initial count */
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is to detach an inode from the
 * current owner if it's being written to by some other cgroups too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.
 * As such, the goal is not taking too long when a different cgroup takes
 * over an inode while avoiding too aggressive flip-flops from occasional
 * foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT	13	/* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT	3	/* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV	8	/* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD	(2 * (1 << WB_FRN_TIME_SHIFT))	/* 2s */

#define WB_FRN_HIST_SLOTS	16	/* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT	(WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
					/* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS	(WB_FRN_HIST_SLOTS / 2)
					/* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS	(WB_FRN_HIST_THR_SLOTS / 2 + 1)
					/* one round can affect up to 5 slots */
#define WB_FRN_MAX_IN_FLIGHT	1024	/* don't queue too many concurrently */

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
 */
#define WB_MAX_INODES_PER_ISW	((1024UL - sizeof(struct inode_switch_wbs_context)) \
				/ sizeof(struct inode *))

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

void __inode_attach_wb(struct inode *inode, struct folio *folio)
{
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct bdi_writeback *wb = NULL;

	if (inode_cgwb_enabled(inode)) {
		struct cgroup_subsys_state *memcg_css;

		if (folio) {
			memcg_css = mem_cgroup_css_from_folio(folio);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
		} else {
			/* must pin memcg_css, see wb_get_create() */
			memcg_css = task_get_css(current, memory_cgrp_id);
			wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
			css_put(memcg_css);
		}
	}

	if (!wb)
		wb = &bdi->wb;

	/*
	 * There may be multiple instances of this function racing to
	 * update the same inode.  Use cmpxchg() to tell the winner.
	 */
	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
		wb_put(wb);
}

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and if necessary, put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
					struct bdi_writeback *wb)
{
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);
	WARN_ON_ONCE(inode->i_state & I_FREEING);

	inode->i_state &= ~I_SYNC_QUEUED;
	if (wb != &wb->bdi->wb)
		list_move(&inode->i_io_list, &wb->b_attached);
	else
		list_del_init(&inode->i_io_list);
	wb_io_lists_depopulated(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
	__acquires(&wb->list_lock)
{
	while (true) {
		struct bdi_writeback *wb = inode_to_wb(inode);

		/*
		 * inode_to_wb() association is protected by both
		 * @inode->i_lock and @wb->list_lock but list_lock nests
		 * outside i_lock.
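		 * For illustration (a sketch inferred from that
		 * statement, not original text), the safe order is:
		 *
		 *	spin_lock(&wb->list_lock);
		 *	spin_lock(&inode->i_lock);
		 *
		 * so a path holding only i_lock cannot take list_lock
		 * without dropping i_lock first.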
Drop i_lock and verify that the * association hasn't changed after acquiring list_lock. */ wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); /* i_wb may have changed inbetween, can't use inode_to_wb() */ if (likely(wb == inode->i_wb)) { wb_put(wb); /* @inode already has ref */ return wb; } spin_unlock(&wb->list_lock); wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } } /** * inode_to_wb_and_lock_list - determine an inode's wb and lock it * @inode: inode of interest * * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held * on entry. */ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { spin_lock(&inode->i_lock); return locked_inode_to_wb_and_lock_list(inode); } struct inode_switch_wbs_context { struct rcu_work work; /* * Multiple inodes can be switched at once. The switching procedure * consists of two parts, separated by a RCU grace period. To make * sure that the second part is executed for each inode gone through * the first part, all inode pointers are placed into a NULL-terminated * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ struct bdi_writeback *new_wb; struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { down_write(&bdi->wb_switch_rwsem); } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { up_write(&bdi->wb_switch_rwsem); } static bool inode_do_switch_wbs(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb) { struct address_space *mapping = inode->i_mapping; XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; bool switched = false; spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to * folios actually under writeback. */ xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) { if (folio_test_dirty(folio)) { long nr = folio_nr_pages(folio); wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); } } xas_set(&xas, 0); xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { long nr = folio_nr_pages(folio); WARN_ON_ONCE(!folio_test_writeback(folio)); wb_stat_mod(old_wb, WB_WRITEBACK, -nr); wb_stat_mod(new_wb, WB_WRITEBACK, nr); } if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { atomic_dec(&old_wb->writeback_inodes); atomic_inc(&new_wb->writeback_inodes); } wb_get(new_wb); /* * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. * The transfer preserves @inode->dirtied_when ordering. If the @inode * was clean, it means it was on the b_attached list, so move it onto * the b_attached list of @new_wb. 
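 * A worked illustration with assumed values: if @new_wb->b_dirty holds
 * inodes with dirtied_when values 300, 200 and 100 from head to tail, an
 * @inode with dirtied_when 250 is linked in between 300 and 200, keeping
 * the newest-first ordering intact.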
*/ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { struct inode *pos; list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode_cgwb_move_to_attached(inode, new_wb); } } else { inode->i_wb = new_wb; } /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; switched = true; skip_switch: /* * Paired with load_acquire in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); return switched; } static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; /* * If @inode switches cgwb membership while sync_inodes_sb() is * being issued, sync_inodes_sb() might miss it. Synchronize. */ down_read(&bdi->wb_switch_rwsem); /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be * synchronizing against the i_pages lock. * * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&new_wb->list_lock); spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } for (inodep = isw->inodes; *inodep; inodep++) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; } spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); if (nr_switched) { wb_wakeup(new_wb); wb_put_many(old_wb, nr_switched); } for (inodep = isw->inodes; *inodep; inodep++) iput(*inodep); wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); } static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { /* * Paired with smp_mb() in cgroup_writeback_umount(). * isw_nr_in_flight must be increased before checking SB_ACTIVE and * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 * in cgroup_writeback_umount() and the isw_wq will be not flushed. */ smp_mb(); if (IS_DAX(inode)) return false; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } inode->i_state |= I_WB_SWITCH; __iget(inode); spin_unlock(&inode->i_lock); return true; } /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode * @new_wb_id: ID of the new wb * * Switch @inode's wb association to the wb identified by @new_wb_id. The * switching is performed asynchronously and may fail silently. 
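 *
 * (Summarizing the body below rather than adding new behavior: the switch
 * is silently skipped when one is already pending on @inode, when more
 * than WB_FRN_MAX_IN_FLIGHT switches are in flight, or when the target
 * memcg css or wb cannot be pinned or allocated.)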
*/ static void inode_switch_wbs(struct inode *inode, int new_wb_id) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC); if (!isw) return; atomic_inc(&isw_nr_in_flight); /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) goto out_free; isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); if (!isw->new_wb) goto out_free; if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); queue_rcu_work(isw_wq, &isw->work); return; out_free: atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); } static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { if (!inode_prepare_wbs_switch(inode, isw->new_wb)) continue; isw->inodes[*nr] = inode; (*nr)++; if (*nr >= WB_MAX_INODES_PER_ISW - 1) return true; } return false; } /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb * * Switch all inodes attached to @wb to a nearest living ancestor's wb in order * to eventually release the dying @wb. Returns %true if not all inodes were * switched and the function has to be restarted. */ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; int nr; bool restart = false; isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW), GFP_KERNEL); if (!isw) return restart; atomic_inc(&isw_nr_in_flight); for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); if (isw->new_wb) break; } if (unlikely(!isw->new_wb)) isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); /* * In addition to the inodes that have completed writeback, also switch * cgwbs for those inodes only with dirty timestamps. Otherwise, those * inodes won't be written back for a long time when lazytime is * enabled, and thus pinning the dying cgwbs. It won't break the * bandwidth restrictions, as writeback of inode metadata is not * accounted for. */ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); if (!restart) restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); wb_put(isw->new_wb); kfree(isw); return restart; } /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
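 *
 * A rough sketch of the ordering relied upon here (paraphrasing
 * queue_rcu_work() semantics, not original text):
 *
 *	inode->i_state |= I_WB_SWITCH;	/* inode_prepare_wbs_switch() */
 *	queue_rcu_work(isw_wq, &isw->work);
 *		-> RCU grace period elapses
 *		-> inode_switch_wbs_work_fn() runs from isw_wq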
*/ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); queue_rcu_work(isw_wq, &isw->work); return restart; } /** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest * @inode: target inode * * @inode is locked and about to be written back under the control of @wbc. * Record @inode's writeback context into @wbc and unlock the i_lock. On * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); return; } wbc->wb = inode_to_wb(inode); wbc->inode = inode; wbc->wb_id = wbc->wb->memcg_css->id; wbc->wb_lcand_id = inode->i_wb_frn_winner; wbc->wb_tcand_id = 0; wbc->wb_bytes = 0; wbc->wb_lcand_bytes = 0; wbc->wb_tcand_bytes = 0; wb_get(wbc->wb); spin_unlock(&inode->i_lock); /* * A dying wb indicates that either the blkcg associated with the * memcg changed or the associated memcg is dying. In the first * case, a replacement wb should already be available and we should * refresh the wb immediately. In the second case, trying to * refresh will keep failing. */ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } /** * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite * @wbc: writeback_control of interest * @inode: target inode * * This function is to be used by __filemap_fdatawrite_range(), which is an * alternative entry point into writeback code, and first ensures @inode is * associated with a bdi_writeback and attaches it to @wbc. */ void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { spin_lock(&inode->i_lock); inode_attach_wb(inode, NULL); wbc_attach_and_unlock_inode(wbc, inode); } EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection * @wbc: writeback_control of the just finished writeback * * To be called after a writeback attempt of an inode finishes and undoes * wbc_attach_and_unlock_inode(). Can be called under any context. * * As concurrent write sharing of an inode is expected to be very rare and * memcg only tracks page ownership on first-use basis severely confining * the usefulness of such sharing, cgroup writeback tracks ownership * per-inode. While the support for concurrent write sharing of an inode * is deemed unnecessary, an inode being written to by different cgroups at * different points in time is a lot more common, and, more importantly, * charging only by first-use can too readily lead to grossly incorrect * behaviors (single foreign page can lead to gigabytes of writeback to be * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of * an inode and transfers the ownership to it. To avoid unnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. * * On each writeback attempt, @wbc tries to detect the majority writer * using Boyer-Moore majority vote algorithm. 
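 * For illustration, the classic single-candidate form of the vote looks
 * roughly like the following (a sketch, not the exact code below):
 *
 *	if (!tcand_bytes)
 *		tcand_id = id;
 *	if (id == tcand_id)
 *		tcand_bytes += bytes;
 *	else
 *		tcand_bytes -= min(bytes, tcand_bytes);
 *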
 * In addition to the byte count from the majority voting, it also counts
 * the bytes written for the current wb and the last round's winner wb
 * (max of last round's current wb, the winner from two rounds ago, and
 * the last round's majority candidate).  Keeping track of the historical
 * winner helps the algorithm to semi-reliably detect the most active
 * writer even when it's not the absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
	struct bdi_writeback *wb = wbc->wb;
	struct inode *inode = wbc->inode;
	unsigned long avg_time, max_bytes, max_time;
	u16 history;
	int max_id;

	if (!wb)
		return;

	history = inode->i_wb_frn_history;
	avg_time = inode->i_wb_frn_avg_time;

	/* pick the winner of this round */
	if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
	    wbc->wb_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_id;
		max_bytes = wbc->wb_bytes;
	} else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
		max_id = wbc->wb_lcand_id;
		max_bytes = wbc->wb_lcand_bytes;
	} else {
		max_id = wbc->wb_tcand_id;
		max_bytes = wbc->wb_tcand_bytes;
	}

	/*
	 * Calculate the amount of IO time the winner consumed and fold it
	 * into the running average kept per inode.  If the consumed IO
	 * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
	 * deciding whether to switch or not.  This is to prevent one-off
	 * small dirtiers from skewing the verdict.
	 */
	max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
				wb->avg_write_bandwidth);
	if (avg_time)
		avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
			    (avg_time >> WB_FRN_TIME_AVG_SHIFT);
	else
		avg_time = max_time;	/* immediate catch up on first run */

	if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
		int slots;

		/*
		 * The switch verdict is reached if foreign wb's consume
		 * more than a certain proportion of IO time in a
		 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
		 * history mask where each bit represents one sixteenth of
		 * the period.  Determine the number of slots to shift into
		 * history from @max_time.
		 */
		slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
			    (unsigned long)WB_FRN_HIST_MAX_SLOTS);
		history <<= slots;
		if (wbc->wb_id != max_id)
			history |= (1U << slots) - 1;

		if (history)
			trace_inode_foreign_history(inode, wbc, history);

		/*
		 * Switch if the current wb isn't the consistent winner.
		 * If there are multiple closely competing dirtiers, the
		 * inode may switch across them repeatedly over time, which
		 * is okay.  The main goal is avoiding keeping an inode on
		 * the wrong wb for an extended period of time.
		 */
		if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
			inode_switch_wbs(inode, max_id);
	}

	/*
	 * Multiple instances of this function may race to update the
	 * following fields but we don't mind occasional inaccuracies.
	 */
	inode->i_wb_frn_winner = max_id;
	inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
	inode->i_wb_frn_history = history;

	wb_put(wbc->wb);
	wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @folio: folio being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @folio are about to be written out during the writeback
 * controlled by @wbc.  Keep the book for foreign inode detection.
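 *
 * (In short, per the body below: bytes for the current wb go to wb_bytes,
 * bytes for the last round's winner also go to wb_lcand_bytes, and every
 * write not from the current wb additionally feeds the majority-vote
 * candidate through wb_tcand_{id,bytes}.)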
See * wbc_detach_inode(). */ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { struct cgroup_subsys_state *css; int id; /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (!wbc->wb || wbc->no_cgroup_owner) return; css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; return; } if (id == wbc->wb_lcand_id) wbc->wb_lcand_bytes += bytes; /* Boyer-Moore majority vote algorithm */ if (!wbc->wb_tcand_bytes) wbc->wb_tcand_id = id; if (id == wbc->wb_tcand_id) wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi * * Split @wb's portion of @nr_pages according to @wb's write bandwidth in * relation to the total write bandwidth of all wb's w/ dirty inodes on * @wb->bdi. */ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); if (nr_pages == LONG_MAX) return LONG_MAX; /* * This may be called on clean wb's and proportional distribution * may not make sense, just use the original @nr_pages in those * cases. In general, we wanna err on the side of writing more. */ if (!tot_bw || this_bw >= tot_bw) return nr_pages; else return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); } /** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue * @skip_if_busy: skip wb's which already have writeback in progress * * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's * distributed to the busy wbs according to each wb's proportion in the * total active write bandwidth of @bdi. */ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; struct bdi_writeback *wb = list_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { DEFINE_WB_COMPLETION(fallback_work_done, bdi); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; if (last_wb) { wb_put(last_wb); last_wb = NULL; } /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || list_empty(&wb->b_dirty_time))) continue; if (skip_if_busy && writeback_in_progress(wb)) continue; nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 1; wb_queue_work(wb, work); continue; } /* * If wb_tryget fails, the wb has been shutdown, skip it. * * Pin @wb so that it stays on @bdi->wb_list. This allows * continuing iteration from @wb after dropping and * regrabbing rcu read lock. 
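 *
 * (A note on the pattern below, summarizing rather than extending it: the
 * on-stack fallback work is waited for with wb_wait_for_completion() after
 * dropping the RCU read lock, and iteration then resumes from the pinned
 * @wb via list_for_each_entry_continue_rcu() at the restart label.)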
*/ if (!wb_tryget(wb)) continue; /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 0; work->done = &fallback_work_done; wb_queue_work(wb, work); last_wb = wb; rcu_read_unlock(); wb_wait_for_completion(&fallback_work_done); goto restart; } rcu_read_unlock(); if (last_wb) wb_put(last_wb); } /** * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; unsigned long dirty; int ret; /* lookup bdi and memcg */ bdi = bdi_get_by_id(bdi_id); if (!bdi) return -ENOENT; rcu_read_lock(); memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) { ret = -ENOENT; goto out_bdi_put; } /* * And find the associated wb. If the wb isn't there already * there's nothing to flush, don't create one. */ wb = wb_get_lookup(bdi, memcg_css); if (!wb) { ret = -ENOENT; goto out_css_put; } /* * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. * * BTW the memcg stats are flushed periodically and this is best-effort * estimation, so some potential error is ok. */ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; work->done = done; work->auto_free = 1; wb_queue_work(wb, work); ret = 0; } else { ret = -ENOMEM; } wb_put(wb); out_css_put: css_put(memcg_css); out_bdi_put: bdi_put(bdi); return ret; } /** * cgroup_writeback_umount - flush inode wb switches for umount * @sb: target super_block * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through * RCU and then workqueue, so the two need to be flushed in order to ensure * that all previously scheduled switches are finished. As wb switches are * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ void cgroup_writeback_umount(struct super_block *sb) { if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) return; /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). */ smp_mb(); if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to * ensure that all in-flight wb switches are in the workqueue. 
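 *
 * (That is: rcu_barrier() covers the call_rcu() leg of queue_rcu_work(),
 * after which every pending switch is an ordinary work item that
 * flush_workqueue(isw_wq) below can drain.)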
*/ rcu_barrier(); flush_workqueue(isw_wq); } } static int __init cgroup_writeback_init(void) { isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); if (!isw_wq) return -ENOMEM; return 0; } fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); WARN_ON_ONCE(inode->i_state & I_FREEING); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); return wb; } static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_lock(&wb->list_lock); return wb; } static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { return nr_pages; } static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { might_sleep(); if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; wb_queue_work(&bdi->wb, base_work); } } static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock) { spin_unlock(&inode->i_lock); } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * Add in the number of potentially dirty inodes, because each inode * write can dirty pagecache in the underlying blockdev. */ static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + get_nr_dirty_inodes(); } static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { if (!wb_has_dirty_io(wb)) return; /* * All callers of this function want to start writeback of all * dirty pages. Places like vmscan can call this at a very * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and * inflight at the time. */ if (test_bit(WB_start_all, &wb->state) || test_and_set_bit(WB_start_all, &wb->state)) return; wb->start_all_reason = reason; wb_wakeup(wb); } /** * wb_start_background_writeback - start background writeback * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(wb); wb_wakeup(wb); } /* * Remove the inode from the writeback list it is on. 
*/ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb */ void sb_mark_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (list_empty(&inode->i_wb_list)) { list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); trace_sb_mark_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * clear an inode as under writeback on the sb */ void sb_clear_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (!list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (!list_empty(&inode->i_wb_list)) { list_del_init(&inode->i_wb_list); trace_sb_clear_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; /* * When the inode is being freed just don't bother with dirty list * tracking. Flush worker will ignore this inode anyway and it will * trigger assertions in inode_io_list_move_locked(). */ if (inode->i_state & I_FREEING) { list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); return; } if (!list_empty(&wb->b_dirty)) { struct inode *tail; tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { spin_lock(&inode->i_lock); redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); } /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { assert_spin_locked(&inode->i_lock); inode->i_state &= ~I_SYNC; /* If inode is clean an unused, put it into LRU now... */ inode_add_lru(inode); /* Called with inode->i_lock which ensures memory ordering. */ inode_wake_up_bit(inode, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) { bool ret = time_after(inode->dirtied_when, t); #ifndef CONFIG_64BIT /* * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif return ret; } /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. 
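 *
 * A worked illustration with assumed values: with dirtied_before set to
 * jiffies - 30 * HZ, an inode dirtied 60s ago is moved while one dirtied
 * 10s ago stays on @delaying_queue; the scan starts from the oldest
 * entries at the tail and stops at the first not-yet-expired inode.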
*/ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, unsigned long dirtied_before) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); goto out; } /* * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', * we don't take inode->i_lock here because it is just a pointless overhead. * Inode is already marked as I_SYNC_QUEUED so writeback list handling is * fully under our control. */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) list_move(&inode->i_io_list, dispatch_queue); } } out: return moved; } /* * Queue all expired dirty inodes for io, eldest first. * Before * newly dirtied b_dirty b_io b_more_io * =============> gf edc BA * After * newly dirtied b_dirty b_io b_more_io * =============> g fBAedc * | * +--> dequeue for IO */ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before) { int moved; unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); if (!work->for_sync) time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, time_expire_jif); if (moved) wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { trace_writeback_write_inode_start(inode, wbc); ret = inode->i_sb->s_op->write_inode(inode, wbc); trace_writeback_write_inode(inode, wbc); return ret; } return 0; } /* * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. */ void inode_wait_for_writeback(struct inode *inode) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; assert_spin_locked(&inode->i_lock); if (!(inode->i_state & I_SYNC)) return; wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); for (;;) { prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ if (!(inode->i_state & I_SYNC)) break; spin_unlock(&inode->i_lock); schedule(); spin_lock(&inode->i_lock); } finish_wait(wq_head, &wqe.wq_entry); } /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. 
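 *
 * (Unlike inode_wait_for_writeback() above, this helper sleeps at most
 * once and returns without i_lock held, so callers must revalidate the
 * inode state themselves.)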
 */
static void inode_sleep_on_writeback(struct inode *inode)
	__releases(inode->i_lock)
{
	struct wait_bit_queue_entry wqe;
	struct wait_queue_head *wq_head;
	bool sleep;

	assert_spin_locked(&inode->i_lock);

	wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
	prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
	/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
	sleep = !!(inode->i_state & I_SYNC);
	spin_unlock(&inode->i_lock);
	if (sleep)
		schedule();
	finish_wait(wq_head, &wqe.wq_entry);
}

/*
 * Find proper writeback list for the inode depending on its current state and
 * possibly also change of its state while we were doing writeback.  Here we
 * handle things such as livelock prevention or fairness of writeback among
 * inodes.  This function can only be called by the flusher thread; no one
 * else processes all inodes on the writeback lists, and requeueing inodes
 * behind the flusher thread's back can have unexpected consequences.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
			  struct writeback_control *wbc,
			  unsigned long dirtied_before)
{
	if (inode->i_state & I_FREEING)
		return;

	/*
	 * Sync livelock prevention.  Each inode is tagged and synced in one
	 * shot.  If still dirty, it will be redirty_tail()'ed below.  Update
	 * the dirty time to prevent enqueue and sync it again.
	 */
	if ((inode->i_state & I_DIRTY) &&
	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
		inode->dirtied_when = jiffies;

	if (wbc->pages_skipped) {
		/*
		 * Writeback is not making progress due to locked buffers.
		 * Skip this inode for now.  Although having skipped pages
		 * is odd for clean inodes, it can happen for some
		 * filesystems so handle that gracefully.
		 */
		if (inode->i_state & I_DIRTY_ALL)
			redirty_tail_locked(inode, wb);
		else
			inode_cgwb_move_to_attached(inode, wb);
		return;
	}

	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		/*
		 * We didn't write back all the pages.  nfs_writepages()
		 * sometimes bails out without doing anything.
		 */
		if (wbc->nr_to_write <= 0 &&
		    !inode_dirtied_after(inode, dirtied_before)) {
			/* Slice used up. Queue for next turn. */
			requeue_io(inode, wb);
		} else {
			/*
			 * Writeback blocked by something other than
			 * congestion. Delay the inode for some time to
			 * avoid spinning on the CPU (100% iowait)
			 * retrying writeback of the dirty page/inode
			 * that cannot be performed immediately.
			 */
			redirty_tail_locked(inode, wb);
		}
	} else if (inode->i_state & I_DIRTY) {
		/*
		 * Filesystems can dirty the inode during writeback operations,
		 * such as delayed allocation during submission or metadata
		 * updates after data IO completion.
		 */
		redirty_tail_locked(inode, wb);
	} else if (inode->i_state & I_DIRTY_TIME) {
		inode->dirtied_when = jiffies;
		inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
		inode->i_state &= ~I_SYNC_QUEUED;
	} else {
		/* The inode is clean. Remove from writeback lists. */
		inode_cgwb_move_to_attached(inode, wb);
	}
}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages,
 * depending on @wbc->nr_to_write), and clear the relevant dirty flags from
 * i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is otherwise responsible for writeback list
 * handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
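 *
 * The expected caller protocol, sketched from writeback_single_inode()
 * below (illustrative, error handling omitted):
 *
 *	spin_lock(&inode->i_lock);
 *	inode->i_state |= I_SYNC;
 *	wbc_attach_and_unlock_inode(wbc, inode);
 *	__writeback_single_inode(inode, wbc);
 *	wbc_detach_inode(wbc);
 *	...
 *	inode_sync_complete(inode);	/* clears I_SYNC, wakes waiters */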
*/ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; WARN_ON(!(inode->i_state & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); ret = do_writepages(mapping, wbc); /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data * I/O completion. We don't do it for sync(2) writeback because it has a * separate, external IO completion path and ->sync_fs for guaranteeing * inode metadata is written back correctly. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } /* * If the inode has dirty timestamps and we need to write them, call * mark_inode_dirty_sync() to notify the filesystem about it and to * change I_DIRTY_TIME into I_DIRTY_SYNC. */ if ((inode->i_state & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { trace_writeback_lazytime(inode); mark_inode_dirty_sync(inode); } /* * Get and clear the dirty flags from i_state. This needs to be done * after calling writepages because some filesystems may redirty the * inode during writepages due to delalloc. It also needs to be done * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows * __mark_inode_dirty() to test i_state without grabbing i_lock - * either they see the I_DIRTY bits cleared or we see the dirtied * inode. * * I_DIRTY_PAGES is always cleared together above even if @mapping * still has dirty pages. The flag is reinstated after smp_mb() if * necessary. This guarantees that either __mark_inode_dirty() * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. */ smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) { if (!(inode->i_state & I_DIRTY_PAGES)) { inode->i_state &= ~I_PINNING_NETFS_WB; wbc->unpinned_netfs_wb = true; dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */ } } spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } wbc->unpinned_netfs_wb = false; trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } /* * Write out an inode's dirty data and metadata on-demand, i.e. separately from * the regular batched writeback done by the flusher threads in * writeback_sb_inodes(). @wbc controls various aspects of the write, such as * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * * To prevent the inode from going away, either the caller must have a reference * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. 
For WB_SYNC_ALL, we * must wait for the existing writeback to complete, then do * writeback again if there's anything left. */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * * For data-integrity syncs we also need to check whether any pages are * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If the inode is freeing, its i_io_list shouldn't be updated * as it may be finally deleted at this point. */ if (!(inode->i_state & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists. */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); else if (!(inode->i_state & I_SYNC_QUEUED)) { if ((inode->i_state & I_DIRTY)) redirty_tail_locked(inode, wb); else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } } } spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); return ret; } static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; /* * WB_SYNC_ALL mode does livelock avoidance by syncing dirty * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX * here avoids calling into writeback_inodes_wb() more than once. * * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); } return pages; } /* * Write a portion of b_io inodes which belong to @sb. * * Return the number of pages and/or inodes written. * * NOTE! This is called with wb->list_lock held, and will * unlock and relock that for each inode it ends up doing * IO for.
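 *
 * A sketch of the expected calling pattern (this mirrors what
 * wb_writeback() and __writeback_inodes_wb() below actually do; shown
 * here only as an illustration of the locking contract):
 *
 *	spin_lock(&wb->list_lock);
 *	if (list_empty(&wb->b_io))
 *		queue_io(wb, work, dirtied_before);
 *	progress = writeback_sb_inodes(work->sb, wb, work);
 *	spin_unlock(&wb->list_lock);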
*/ static long writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, struct wb_writeback_work *work) { struct writeback_control wbc = { .sync_mode = work->sync_mode, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, }; unsigned long start_time = jiffies; long write_chunk; long total_wrote = 0; /* count both pages and inodes */ unsigned long dirtied_before = jiffies; if (work->for_kupdate) dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; long wrote; if (inode->i_sb != sb) { if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ redirty_tail(inode, wb); continue; } /* * The inode belongs to a different superblock. * Bounce back to the caller to unpin this and * pin the next superblock. */ break; } /* * Don't bother with new inodes or inodes being freed; the * former do not need periodic writeout yet, and for the latter * writeout is handled by the freer. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to * b_more_io so that writeback can proceed with the * other inodes on s_io. * * We'll have another go at writing back this inode * when we have completed a full scan of b_io. */ requeue_io(inode, wb); spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } spin_unlock(&wb->list_lock); /* * We already requeued the inode if it had I_SYNC set and we * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ if (inode->i_state & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; /* * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ __writeback_single_inode(inode, &wbc); wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; wrote = wrote < 0 ? 0 : wrote; total_wrote += wrote; if (need_resched()) { /* * We're trying to balance between building up a nice * long list of IOs to improve our merge rate, and * getting those IOs out quickly for anyone throttling * in balance_dirty_pages(). cond_resched() doesn't * unplug, so get our IOs out the door before we * give up the CPU. */ blk_flush_plug(current->plug, false); cond_resched(); } /* * Requeue @inode if still dirty. Be careful as @inode may * have been switched to another wb in the meantime.
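 * (With cgroup writeback an inode can be switched to a different wb while
 * i_lock was dropped; see the inode_switch_wbs_work_fn() reference in
 * sync_inodes_sb() below.)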
*/ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc, dirtied_before); inode_sync_complete(inode); spin_unlock(&inode->i_lock); if (unlikely(tmp_wb != wb)) { spin_unlock(&tmp_wb->list_lock); spin_lock(&wb->list_lock); } /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. */ if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, struct wb_writeback_work *work) { unsigned long start_time = jiffies; long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!super_trylock_shared(sb)) { /* * super_trylock_shared() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ redirty_tail(inode, wb); continue; } wrote += writeback_sb_inodes(sb, wb, work); up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } /* Leave any unwritten inodes on b_io */ return wrote; } static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, }; struct blk_plug plug; blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work.nr_pages; } /* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the * dirtying-time in the inode's address_space. So this periodic writeback code * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * * Try to run once per dirty_writeback_interval. But if a writeback event * takes longer than dirty_writeback_interval, then leave a * one-second gap. * * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; bool queued = false; blk_start_plug(&plug); for (;;) { /* * Stop writeback when nr_pages has been consumed */ if (work->nr_pages <= 0) break; /* * Background writeout and kupdate-style writeback may * run forever. Stop them if there is other work to do * so that e.g. sync can proceed. They'll be restarted * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ if (work->for_background && !wb_over_bg_thresh(wb)) break; spin_lock(&wb->list_lock); trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) { /* * Kupdate and background works are special and we want * to include all inodes that need writing.
Livelock * avoidance is handled by these works yielding to any * other work so we are safe. */ if (work->for_kupdate) { dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) dirtied_before = jiffies; queue_io(wb, work, dirtied_before); queued = true; } if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); /* * Did we write something? Try for more * * Dirty inodes are moved to b_io for writeback in batches. * The completion of the current batch does not necessarily * mean the overall work is done. So we keep looping as long * as we made some progress on cleaning pages or inodes. */ if (progress || !queued) { spin_unlock(&wb->list_lock); continue; } /* * No more inodes for IO, bail */ if (list_empty(&wb->b_more_io)) { spin_unlock(&wb->list_lock); break; } /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. */ trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); } blk_finish_plug(&plug); return nr_pages - work->nr_pages; } /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; spin_lock_irq(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } spin_unlock_irq(&wb->work_lock); return work; } static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; long nr_pages; /* * When set to zero, disable periodic writeback */ if (!dirty_writeback_interval) return 0; expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) return 0; wb->last_old_flush = jiffies; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_start_all(struct bdi_writeback *wb) { long nr_pages; if (!test_bit(WB_start_all, &wb->state)) return 0; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = wb_split_bdi_pages(wb, nr_pages), .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = wb->start_all_reason, }; nr_pages = wb_writeback(wb, &work); } clear_bit(WB_start_all, &wb->state); return nr_pages; } /* * Retrieve work items and do the writeback they describe */ static long wb_do_writeback(struct bdi_writeback *wb) { struct wb_writeback_work *work; long wrote = 0; set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); finish_writeback_work(work); } /* * Check for a flush-everything request */ wrote += wb_check_start_all(wb); /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb);
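/*
 * Check for background writeback, when over the background dirty threshold
 */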
wrote += wb_check_background_flush(wb); clear_bit(WB_writeback_running, &wb->state); return wrote; } /* * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { /* * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } if (!list_empty(&wb->work_list)) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); } /* * Start writeback of all dirty pages on this bdi. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { struct bdi_writeback *wb; if (!bdi_has_dirty_io(bdi)) return; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { rcu_read_lock(); __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; /* * If we are expecting writeback progress we must submit plugged IO. */ blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wake up bdis periodically to make sure dirtytime inodes get * written back. We deliberately do *not* check the * b_dirtytime list in wb_has_dirty_io(), since this would cause the * kernel to be constantly waking up once there are any dirtytime * inodes on the system. So instead we define a separate delayed work * function which gets called much more rarely. (By default, only * once every 12 hours.) * * If there is any other write activity going on in the file system, * this function won't be necessary. But if the only thing that has * happened on the file system is a dirtytime inode caused by an atime * update, we need this infrastructure below to make sure that inode * eventually gets pushed out to disk.
*/ static void wakeup_dirtytime_writeback(struct work_struct *w); static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); static void wakeup_dirtytime_writeback(struct work_struct *w) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) if (!list_empty(&wb->b_dirty_time)) wb_wakeup(wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } static int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) mod_delayed_work(system_wq, &dirtytime_work, 0); return ret; } static const struct ctl_table vm_fs_writeback_table[] = { { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, .maxlen = sizeof(dirtytime_expire_interval), .mode = 0644, .proc_handler = dirtytime_interval_handler, .extra1 = SYSCTL_ZERO, }, }; static int __init start_dirtytime_writeback(void) { schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); register_sysctl_init("vm", vm_fs_writeback_table); return 0; } __initcall(start_dirtytime_writeback); /** * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined * with I_DIRTY_PAGES. * * Mark an inode as dirty. We notify the filesystem, then update the inode's * dirty flags. Then, if needed, we add the inode to the appropriate dirty list. * * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() * instead of calling this directly. * * CAREFUL! We only add the inode to the dirty list if it is hashed or if it * refers to a blockdev. Unhashed inodes will never be added to the dirty list * even if they are later hashed, as they will have been marked dirty already. * * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of * the kernel-internal blockdev inode represents the dirtying time of the * blockdev's pages. This is why for I_DIRTY_PAGES we always use * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); if (flags & I_DIRTY_INODE) { /* * Inode timestamp update will piggyback on this dirtying. * We tell the ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); if (inode->i_state & I_DIRTY_TIME) { inode->i_state &= ~I_DIRTY_TIME; flags |= I_DIRTY_TIME; } spin_unlock(&inode->i_lock); } /* * Notify the filesystem about the inode being dirtied, so that * (if needed) it can update on-disk fields and journal the * inode. This is only needed when the inode itself is being * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not * for just I_DIRTY_PAGES or I_DIRTY_TIME.
*/ trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) sb->s_op->dirty_inode(inode, flags & (I_DIRTY_INODE | I_DIRTY_TIME)); trace_writeback_dirty_inode(inode, flags); /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; } else { /* * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME * in one call to __mark_inode_dirty().) */ dirtytime = flags & I_DIRTY_TIME; WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); } /* * Paired with smp_mb() in __writeback_single_inode() for the * following lockless i_state test. See there for details. */ smp_mb(); if ((inode->i_state & flags) == flags) return; spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; inode_attach_wb(inode, NULL); inode->i_state |= flags; /* * Grab inode's wb early because it requires dropping i_lock and we * need to make sure the following checks happen atomically with dirty * list handling so that we don't move inodes under the flush worker's * hands. */ if (!was_dirty) { wb = locked_inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); } /* * If the inode is queued for writeback by the flush worker, just * update its dirty state. Once the flush worker is done with * the inode it will place it on the appropriate superblock * list, based upon its state. */ if (inode->i_state & I_SYNC_QUEUED) goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's * dirty list. Add blockdev inodes as well. */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) goto out_unlock; } if (inode->i_state & I_FREEING) goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { struct list_head *dirty_list; bool wakeup_bdi = false; inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; if (inode->i_state & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* * If this is the first dirty inode for this bdi, * we have to wake up the corresponding bdi thread * to make sure background write-back happens * later. */ if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } } out_unlock: if (wb) spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); /* * The @s_sync_lock is used to serialise concurrent sync operations * to avoid lock contention problems with concurrent wait_sb_inodes() calls. * Concurrent callers will block on the s_sync_lock rather than doing contending * walks. The queueing maintains sync(2) required behaviour as all the IO that * has been issued up to the time this function is entered is guaranteed to be * completed by the time we have gained the lock and waited for all IO that is * in progress regardless of the order callers are granted the lock. */ static void wait_sb_inodes(struct super_block *sb) { LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from * r/o to r/w or vice versa. */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); /* * Splice the writeback list onto a temporary list to avoid waiting on * inodes that have started writeback after this point.
* * Use rcu_read_lock() to keep the inodes around until we have a * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as * the local list because inodes can be dropped from either by writeback * completion. */ rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); list_splice_init(&sb->s_inodes_wb, &sync_list); /* * Data integrity sync. Must wait for all pages under writeback, because * there may have been pages dirtied before our sync call, but which had * writeout started before we wrote them out. In which case, the inode * may not be on the dirty list, but we still have to wait for that * writeout. */ while (!list_empty(&sync_list)) { struct inode *inode = list_first_entry(&sync_list, struct inode, i_wb_list); struct address_space *mapping = inode->i_mapping; /* * Move each inode back to the wb list before we drop the lock * to preserve consistency between i_wb_list and the mapping * writeback tag. Writeback completion is responsible for removing * the inode from either list once the writeback tag is cleared. */ list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); /* * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. */ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); /* * We keep the error status of each individual mapping so that * applications can catch the writeback error using fsync(2). * See filemap_fdatawait_keep_errors() for details. */ filemap_fdatawait_keep_errors(mapping); cond_resched(); iput(inode); rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); } spin_unlock_irq(&sb->s_inode_wblist_lock); rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .tagged_writepages = 1, .done = &done, .nr_pages = nr, .reason = reason, }; if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); wb_wait_for_completion(&done); } /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO.
*/ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); /** * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock * @reason: reason why some writeback work was initiated * * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. */ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) return; __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this * super_block. */ void sync_inodes_sb(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, .for_sync = 1, }; /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty * inodes under writeback and I_DIRTY_TIME inodes ignored by * bdi_has_dirty() need to be written out too. */ if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(&done); bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk * @inode: inode to write to disk * @sync: whether the write should be synchronous or not * * This function commits an inode to disk immediately if it is dirty. This is * primarily needed by knfsd. * * The caller must either have a ref on the inode or must have set I_WILL_FREE. */ int write_inode_now(struct inode *inode, int sync) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); /** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ int sync_inode_metadata(struct inode *inode, int wait) { struct writeback_control wbc = { .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .nr_to_write = 0, /* metadata-only */ }; return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata);
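/*
 * Userspace view -- a minimal, hypothetical sketch (not part of this file)
 * of how the data-integrity paths above are reached from user space:
 * fsync(2) does a data-integrity (WB_SYNC_ALL style) sync of a single file,
 * syncfs(2) reaches sync_inodes_sb() for one superblock via
 * sync_filesystem(), and sync(2) does the same for every superblock. The
 * file name and error handling below are illustrative only.
 *
 *	#define _GNU_SOURCE		// for syncfs(2) on glibc
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("data.tmp", O_WRONLY | O_CREAT | O_TRUNC, 0644);
 *
 *		if (fd < 0) {
 *			perror("open");
 *			return 1;
 *		}
 *		if (write(fd, "x", 1) != 1)
 *			perror("write");
 *		if (fsync(fd))		// wait for data and metadata of fd
 *			perror("fsync");
 *		if (syncfs(fd))		// flush the filesystem containing fd
 *			perror("syncfs");
 *		sync();			// flush all filesystems
 *		close(fd);
 *		return 0;
 *	}
 */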
// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/ptrace.c * * By Ross Biro 1/23/92 * edited by Linus Torvalds * ARM modifications Copyright (C) 2000 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/audit.h> #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/nospec.h> #include <linux/smp.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/seccomp.h> #include <linux/security.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/regset.h> #include <linux/elf.h> #include <linux/rseq.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> #include <asm/gcs.h> #include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> #include <asm/syscall.h> #include <asm/traps.h> #include <asm/system_misc.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> struct pt_regs_offset { const char *name; int offset; }; #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} #define REG_OFFSET_END {.name = NULL, .offset = 0} #define GPR_OFFSET_NAME(r) \ {.name = "x" #r, .offset = offsetof(struct pt_regs, regs[r])} static const struct pt_regs_offset regoffset_table[] = { GPR_OFFSET_NAME(0), GPR_OFFSET_NAME(1), GPR_OFFSET_NAME(2), GPR_OFFSET_NAME(3), GPR_OFFSET_NAME(4), GPR_OFFSET_NAME(5), GPR_OFFSET_NAME(6), GPR_OFFSET_NAME(7), GPR_OFFSET_NAME(8), GPR_OFFSET_NAME(9), GPR_OFFSET_NAME(10), GPR_OFFSET_NAME(11), GPR_OFFSET_NAME(12), GPR_OFFSET_NAME(13), GPR_OFFSET_NAME(14), GPR_OFFSET_NAME(15), GPR_OFFSET_NAME(16), GPR_OFFSET_NAME(17), GPR_OFFSET_NAME(18), GPR_OFFSET_NAME(19), GPR_OFFSET_NAME(20), GPR_OFFSET_NAME(21), GPR_OFFSET_NAME(22), GPR_OFFSET_NAME(23), GPR_OFFSET_NAME(24), GPR_OFFSET_NAME(25), GPR_OFFSET_NAME(26), GPR_OFFSET_NAME(27), GPR_OFFSET_NAME(28), GPR_OFFSET_NAME(29), GPR_OFFSET_NAME(30), {.name = "lr", .offset = offsetof(struct pt_regs, regs[30])}, REG_OFFSET_NAME(sp), REG_OFFSET_NAME(pc), REG_OFFSET_NAME(pstate), REG_OFFSET_END, }; /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register * * regs_query_register_offset() returns the offset of a register in struct * pt_regs from its name. If the name is invalid, this returns -EINVAL; */ int regs_query_register_offset(const char *name) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (!strcmp(roff->name, name)) return roff->offset; return -EINVAL; } /** * regs_within_kernel_stack() - check the address in the stack * @regs: pt_regs which contains kernel stack pointer. * @addr: address which is checked.
* * regs_within_kernel_stack() checks whether @addr is within the kernel stack page(s). * If @addr is within the kernel stack, it returns true. If not, returns false. */ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || on_irq_stack(addr, sizeof(unsigned long)); } /** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which * is specified by @regs. If the @n th entry is NOT in the kernel stack, * this returns 0. */ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) return *addr; else return 0; } /* * TODO: does not yet catch signals sent when the child dies * in exit.c or in signal.c. */ /* * Called by kernel/ptrace.c when detaching. */ void ptrace_disable(struct task_struct *child) { /* * This would be better off in core code, but PTRACE_DETACH has * grown its fair share of arch-specific warts and changing it * is likely to cause regressions on obscure architectures. */ user_disable_single_step(child); } #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * Handle hitting a HW-breakpoint. */ static void ptrace_hbptriggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); const char *desc = "Hardware breakpoint trap (ptrace)"; if (is_compat_task()) { int si_errno = 0; int i; for (i = 0; i < ARM_MAX_BRP; ++i) { if (current->thread.debug.hbp_break[i] == bp) { si_errno = (i << 1) + 1; break; } } for (i = 0; i < ARM_MAX_WRP; ++i) { if (current->thread.debug.hbp_watch[i] == bp) { si_errno = -((i << 1) + 1); break; } } arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); return; } arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* * Unregister breakpoints from this task and reset the pointers in * the thread_struct.
*/ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { int i; struct thread_struct *t = &tsk->thread; for (i = 0; i < ARM_MAX_BRP; i++) { if (t->debug.hbp_break[i]) { unregister_hw_breakpoint(t->debug.hbp_break[i]); t->debug.hbp_break[i] = NULL; } } for (i = 0; i < ARM_MAX_WRP; i++) { if (t->debug.hbp_watch[i]) { unregister_hw_breakpoint(t->debug.hbp_watch[i]); t->debug.hbp_watch[i] = NULL; } } } void ptrace_hw_copy_thread(struct task_struct *tsk) { memset(&tsk->thread.debug, 0, sizeof(struct debug_info)); } static struct perf_event *ptrace_hbp_get_event(unsigned int note_type, struct task_struct *tsk, unsigned long idx) { struct perf_event *bp = ERR_PTR(-EINVAL); switch (note_type) { case NT_ARM_HW_BREAK: if (idx >= ARM_MAX_BRP) goto out; idx = array_index_nospec(idx, ARM_MAX_BRP); bp = tsk->thread.debug.hbp_break[idx]; break; case NT_ARM_HW_WATCH: if (idx >= ARM_MAX_WRP) goto out; idx = array_index_nospec(idx, ARM_MAX_WRP); bp = tsk->thread.debug.hbp_watch[idx]; break; } out: return bp; } static int ptrace_hbp_set_event(unsigned int note_type, struct task_struct *tsk, unsigned long idx, struct perf_event *bp) { int err = -EINVAL; switch (note_type) { case NT_ARM_HW_BREAK: if (idx >= ARM_MAX_BRP) goto out; idx = array_index_nospec(idx, ARM_MAX_BRP); tsk->thread.debug.hbp_break[idx] = bp; err = 0; break; case NT_ARM_HW_WATCH: if (idx >= ARM_MAX_WRP) goto out; idx = array_index_nospec(idx, ARM_MAX_WRP); tsk->thread.debug.hbp_watch[idx] = bp; err = 0; break; } out: return err; } static struct perf_event *ptrace_hbp_create(unsigned int note_type, struct task_struct *tsk, unsigned long idx) { struct perf_event *bp; struct perf_event_attr attr; int err, type; switch (note_type) { case NT_ARM_HW_BREAK: type = HW_BREAKPOINT_X; break; case NT_ARM_HW_WATCH: type = HW_BREAKPOINT_RW; break; default: return ERR_PTR(-EINVAL); } ptrace_breakpoint_init(&attr); /* * Initialise fields to sane defaults * (i.e. values that will pass validation). */ attr.bp_addr = 0; attr.bp_len = HW_BREAKPOINT_LEN_4; attr.bp_type = type; attr.disabled = 1; bp = register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, tsk); if (IS_ERR(bp)) return bp; err = ptrace_hbp_set_event(note_type, tsk, idx, bp); if (err) return ERR_PTR(err); return bp; } static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; switch (note_type) { case NT_ARM_HW_BREAK: if ((type & HW_BREAKPOINT_X) != type) return -EINVAL; break; case NT_ARM_HW_WATCH: if ((type & HW_BREAKPOINT_RW) != type) return -EINVAL; break; default: return -EINVAL; } attr->bp_len = len; attr->bp_type = type; attr->bp_addr += offset; return 0; } static int ptrace_hbp_get_resource_info(unsigned int note_type, u32 *info) { u8 num; u32 reg = 0; switch (note_type) { case NT_ARM_HW_BREAK: num = hw_breakpoint_slots(TYPE_INST); break; case NT_ARM_HW_WATCH: num = hw_breakpoint_slots(TYPE_DATA); break; default: return -EINVAL; } reg |= debug_monitors_arch(); reg <<= 8; reg |= num; *info = reg; return 0; } static int ptrace_hbp_get_ctrl(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u32 *ctrl) { struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); *ctrl = bp ? 
encode_ctrl_reg(counter_arch_bp(bp)->ctrl) : 0; return 0; } static int ptrace_hbp_get_addr(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u64 *addr) { struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); *addr = bp ? counter_arch_bp(bp)->address : 0; return 0; } static struct perf_event *ptrace_hbp_get_initialised_bp(unsigned int note_type, struct task_struct *tsk, unsigned long idx) { struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); if (!bp) bp = ptrace_hbp_create(note_type, tsk, idx); return bp; } static int ptrace_hbp_set_ctrl(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u32 uctrl) { int err; struct perf_event *bp; struct perf_event_attr attr; struct arch_hw_breakpoint_ctrl ctrl; bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) { err = PTR_ERR(bp); return err; } attr = bp->attr; decode_ctrl_reg(uctrl, &ctrl); err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); if (err) return err; return modify_user_hw_breakpoint(bp, &attr); } static int ptrace_hbp_set_addr(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u64 addr) { int err; struct perf_event *bp; struct perf_event_attr attr; bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) { err = PTR_ERR(bp); return err; } attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); return err; } #define PTRACE_HBP_ADDR_SZ sizeof(u64) #define PTRACE_HBP_CTRL_SZ sizeof(u32) #define PTRACE_HBP_PAD_SZ sizeof(u32) static int hw_break_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { unsigned int note_type = regset->core_note_type; int ret, idx = 0; u32 info, ctrl; u64 addr; /* Resource info */ ret = ptrace_hbp_get_resource_info(note_type, &info); if (ret) return ret; membuf_write(&to, &info, sizeof(info)); membuf_zero(&to, sizeof(u32)); /* (address, ctrl) registers */ while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) return ret; ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl); if (ret) return ret; membuf_store(&to, addr); membuf_store(&to, ctrl); membuf_zero(&to, sizeof(u32)); idx++; } return 0; } static int hw_break_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { unsigned int note_type = regset->core_note_type; int ret, idx = 0, offset, limit; u32 ctrl; u64 addr; /* Resource info and pad */ offset = offsetof(struct user_hwdebug_state, dbg_regs); user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); /* (address, ctrl) registers */ limit = regset->n * regset->size; while (count && offset < limit) { if (count < PTRACE_HBP_ADDR_SZ) return -EINVAL; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &addr, offset, offset + PTRACE_HBP_ADDR_SZ); if (ret) return ret; ret = ptrace_hbp_set_addr(note_type, target, idx, addr); if (ret) return ret; offset += PTRACE_HBP_ADDR_SZ; if (!count) break; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, offset, offset + PTRACE_HBP_CTRL_SZ); if (ret) return ret; ret = ptrace_hbp_set_ctrl(note_type, target, idx, ctrl); if (ret) return ret; offset += PTRACE_HBP_CTRL_SZ; user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, offset, offset + PTRACE_HBP_PAD_SZ); offset += PTRACE_HBP_PAD_SZ; idx++; } return 0; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ static int gpr_get(struct task_struct *target, const struct user_regset *regset, 
struct membuf to) { struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs; return membuf_write(&to, uregs, sizeof(*uregs)); } static int gpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; struct user_pt_regs newregs = task_pt_regs(target)->user_regs; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newregs, 0, -1); if (ret) return ret; if (!valid_user_regs(&newregs, target)) return -EINVAL; task_pt_regs(target)->user_regs = newregs; return 0; } static int fpr_active(struct task_struct *target, const struct user_regset *regset) { if (!system_supports_fpsimd()) return -ENODEV; return regset->n; } /* * TODO: update fp accessors for lazy context switching (sync/flush hwstate) */ static int __fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_fpsimd_state *uregs; sve_sync_to_fpsimd(target); uregs = &target->thread.uw.fpsimd_state; return membuf_write(&to, uregs, sizeof(*uregs)); } static int fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_fpsimd()) return -EINVAL; if (target == current) fpsimd_preserve_current_state(); return __fpr_get(target, regset, to); } static int __fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf, unsigned int start_pos) { int ret; struct user_fpsimd_state newstate; /* * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. */ sve_sync_to_fpsimd(target); newstate = target->thread.uw.fpsimd_state; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, start_pos, start_pos + sizeof(newstate)); if (ret) return ret; target->thread.uw.fpsimd_state = newstate; return ret; } static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; if (!system_supports_fpsimd()) return -EINVAL; ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0); if (ret) return ret; sve_sync_from_fpsimd_zeropad(target); fpsimd_flush_task_state(target); return ret; } static int tls_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int ret; if (target == current) tls_preserve_current_state(); ret = membuf_store(&to, target->thread.uw.tp_value); if (system_supports_tpidr2()) ret = membuf_store(&to, target->thread.tpidr2_el0); else ret = membuf_zero(&to, sizeof(u64)); return ret; } static int tls_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; unsigned long tls[2]; tls[0] = target->thread.uw.tp_value; if (system_supports_tpidr2()) tls[1] = target->thread.tpidr2_el0; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count); if (ret) return ret; target->thread.uw.tp_value = tls[0]; if (system_supports_tpidr2()) target->thread.tpidr2_el0 = tls[1]; return ret; } static int fpmr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_fpmr()) return -EINVAL; if (target == current) fpsimd_preserve_current_state(); return membuf_store(&to, target->thread.uw.fpmr); } static int fpmr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const 
void __user *ubuf) { int ret; unsigned long fpmr; if (!system_supports_fpmr()) return -EINVAL; fpmr = target->thread.uw.fpmr; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); if (ret) return ret; target->thread.uw.fpmr = fpmr; fpsimd_flush_task_state(target); return 0; } static int system_call_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { return membuf_store(&to, task_pt_regs(target)->syscallno); } static int system_call_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int syscallno = task_pt_regs(target)->syscallno; int ret; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &syscallno, 0, -1); if (ret) return ret; task_pt_regs(target)->syscallno = syscallno; return ret; } #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, struct task_struct *target, enum vec_type type) { unsigned int vq; bool active; enum vec_type task_type; memset(header, 0, sizeof(*header)); /* Check if the requested registers are active for the task */ if (thread_sm_enabled(&target->thread)) task_type = ARM64_VEC_SME; else task_type = ARM64_VEC_SVE; active = (task_type == type); switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; break; case ARM64_VEC_SME: if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; break; default: WARN_ON_ONCE(1); return; } if (active) { if (target->thread.fp_type == FP_STATE_FPSIMD) { header->flags |= SVE_PT_REGS_FPSIMD; } else { header->flags |= SVE_PT_REGS_SVE; } } header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); header->size = SVE_PT_SIZE(vq, header->flags); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } static unsigned int sve_size_from_header(struct user_sve_header const *header) { return ALIGN(header->size, SVE_VQ_BYTES); } static int sve_get_common(struct task_struct *target, const struct user_regset *regset, struct membuf to, enum vec_type type) { struct user_sve_header header; unsigned int vq; unsigned long start, end; /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); if (target == current) fpsimd_preserve_current_state(); BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); case SVE_PT_REGS_SVE: start = SVE_PT_SVE_OFFSET; end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); membuf_write(&to, target->thread.sve_state, end - start); start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); membuf_zero(&to, end - start); /* * Copy fpsr, and fpcr which must follow contiguously in * struct fpsimd_state: */ start = end; end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, end - start); start = end; end = sve_size_from_header(&header); return membuf_zero(&to, end - start); default: return 0; } } static int sve_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_sve()) return -EINVAL; return sve_get_common(target, regset, to, ARM64_VEC_SVE); } static int sve_set_common(struct task_struct *target, const struct user_regset *regset, 
unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf, enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; /* Header */ if (count < sizeof(header)) return -EINVAL; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) goto out; /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by * vec_set_vector_length(), which will also validate them for us: */ ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) goto out; /* * Actual VL set may be different from what the user asked * for, or we may have configured the _ONEXEC VL not the * current VL: */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ if (system_supports_sme()) { u64 old_svcr = target->thread.svcr; switch (type) { case ARM64_VEC_SVE: target->thread.svcr &= ~SVCR_SM_MASK; break; case ARM64_VEC_SME: target->thread.svcr |= SVCR_SM_MASK; /* * Disable traps and ensure there is SME storage but * preserve any currently set values in ZA/ZT. */ sme_alloc(target, false); set_tsk_thread_flag(target, TIF_SME); break; default: WARN_ON_ONCE(1); ret = -EINVAL; goto out; } /* * If we switched then invalidate any existing SVE * state and ensure there's storage. */ if (target->thread.svcr != old_svcr) sve_alloc(target, true); } /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; goto out; } /* * Otherwise: no registers or full SVE case. For backwards * compatibility reasons we treat empty flags as SVE registers. */ /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ if (count && vq != sve_vq_from_vl(header.vl)) { ret = -EIO; goto out; } sve_alloc(target, true); if (!target->thread.sve_state) { ret = -ENOMEM; clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; goto out; } /* * Ensure target->thread.sve_state is up to date with target's * FPSIMD regs, so that a short copyin leaves trailing * registers unmodified. Only enable SVE if we are * configuring normal SVE, a system with streaming SVE may not * have normal SVE. 
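 *
 * Editor's note (illustrative sketch, not part of the original source):
 * from userspace this path is driven with PTRACE_SETREGSET on NT_ARM_SVE,
 * and the ordering above is visible in the ABI: the user_sve_header comes
 * first and (re)configures the vector length, and any register payload
 * follows in the layout of the configured VL. A minimal, hedged example of
 * a header-only write; pid and the requested VL are assumptions, and the
 * kernel rounds an unsupported VL to a supported one:
 *
 *	struct user_sve_header hdr = {
 *		.vl = 32,
 *		.flags = SVE_PT_REGS_SVE,
 *	};
 *	struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
 *	ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov);
 *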
*/ fpsimd_sync_to_sve(target); if (type == ARM64_VEC_SVE) set_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_SVE; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, target->thread.sve_state, start, end); if (ret) goto out; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, start, end); /* * Copy fpsr, and fpcr which must follow contiguously in * struct fpsimd_state: */ start = end; end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.uw.fpsimd_state.fpsr, start, end); out: fpsimd_flush_task_state(target); return ret; } static int sve_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { if (!system_supports_sve()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, ARM64_VEC_SVE); } #endif /* CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME static int ssve_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_sme()) return -EINVAL; return sve_get_common(target, regset, to, ARM64_VEC_SME); } static int ssve_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { if (!system_supports_sme()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, ARM64_VEC_SME); } static int za_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_za_header header; unsigned int vq; unsigned long start, end; if (!system_supports_sme()) return -EINVAL; /* Header */ memset(&header, 0, sizeof(header)); if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header.flags |= ZA_PT_VL_INHERIT; header.vl = task_get_sme_vl(target); vq = sve_vq_from_vl(header.vl); header.max_vl = sme_max_vl(); header.max_size = ZA_PT_SIZE(vq); /* If ZA is not active there is only the header */ if (thread_za_enabled(&target->thread)) header.size = ZA_PT_SIZE(vq); else header.size = ZA_PT_ZA_OFFSET; membuf_write(&to, &header, sizeof(header)); BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); end = ZA_PT_ZA_OFFSET; if (target == current) fpsimd_preserve_current_state(); /* Any register data to include? 
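 *
 * Editor's note (illustrative sketch, not part of the original source): on
 * the read side userspace sees exactly this split. When PSTATE.ZA is clear
 * only the header comes back, with header.size == ZA_PT_ZA_OFFSET;
 * otherwise header.size covers the ZA data as well. A hedged sketch, where
 * pid is an assumption:
 *
 *	struct user_za_header hdr;
 *	struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
 *	ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZA, &iov);
 *	if (hdr.size > ZA_PT_ZA_OFFSET) {
 *		... ZA is live: re-read with an hdr.size-byte buffer ...
 *	}
 *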
*/ if (thread_za_enabled(&target->thread)) { start = end; end = ZA_PT_SIZE(vq); membuf_write(&to, target->thread.sme_state, end - start); } /* Zero any trailing padding */ start = end; end = ALIGN(header.size, SVE_VQ_BYTES); return membuf_zero(&to, end - start); } static int za_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; struct user_za_header header; unsigned int vq; unsigned long start, end; if (!system_supports_sme()) return -EINVAL; /* Header */ if (count < sizeof(header)) return -EINVAL; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) goto out; /* * All current ZA_PT_* flags are consumed by * vec_set_vector_length(), which will also validate them for * us: */ ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, ((unsigned long)header.flags) << 16); if (ret) goto out; /* * Actual VL set may be different from what the user asked * for, or we may have configured the _ONEXEC rather than * current VL: */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ if (!target->thread.sve_state) { sve_alloc(target, false); if (!target->thread.sve_state) { ret = -ENOMEM; goto out; } } /* * Only flush the storage if PSTATE.ZA was not already set, * otherwise preserve any existing data. */ sme_alloc(target, !thread_za_enabled(&target->thread)); if (!target->thread.sme_state) return -ENOMEM; /* If there is no data then disable ZA */ if (!count) { target->thread.svcr &= ~SVCR_ZA_MASK; goto out; } /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ if (vq != sve_vq_from_vl(header.vl)) { ret = -EIO; goto out; } BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); start = ZA_PT_ZA_OFFSET; end = ZA_PT_SIZE(vq); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, target->thread.sme_state, start, end); if (ret) goto out; /* Mark ZA as active and let userspace use it */ set_tsk_thread_flag(target, TIF_SME); target->thread.svcr |= SVCR_ZA_MASK; out: fpsimd_flush_task_state(target); return ret; } static int zt_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_sme2()) return -EINVAL; /* * If PSTATE.ZA is not set then ZT will be zeroed when it is * enabled so report the current register value as zero. 
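 *
 * Editor's note (illustrative sketch, not part of the original source):
 * unlike NT_ARM_SVE and NT_ARM_ZA, NT_ARM_ZT carries no header; it is a
 * fixed 512-bit blob (ZT_SIG_REG_BYTES == 64, per the arm64 uapi headers,
 * an assumption worth checking), and as zt_set() below shows, a successful
 * write also sets PSTATE.ZA. A hedged read sketch, pid being an assumption:
 *
 *	unsigned char zt[ZT_SIG_REG_BYTES];
 *	struct iovec iov = { .iov_base = zt, .iov_len = sizeof(zt) };
 *	ptrace(PTRACE_GETREGSET, pid, NT_ARM_ZT, &iov);
 *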
*/ if (thread_za_enabled(&target->thread)) membuf_write(&to, thread_zt_state(&target->thread), ZT_SIG_REG_BYTES); else membuf_zero(&to, ZT_SIG_REG_BYTES); return 0; } static int zt_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; if (!system_supports_sme2()) return -EINVAL; /* Ensure SVE storage in case this is first use of SME */ sve_alloc(target, false); if (!target->thread.sve_state) return -ENOMEM; if (!thread_za_enabled(&target->thread)) { sme_alloc(target, true); if (!target->thread.sme_state) return -ENOMEM; } ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, thread_zt_state(&target->thread), 0, ZT_SIG_REG_BYTES); if (ret == 0) { target->thread.svcr |= SVCR_ZA_MASK; set_tsk_thread_flag(target, TIF_SME); } fpsimd_flush_task_state(target); return ret; } #endif /* CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { /* * The PAC bits can differ across data and instruction pointers * depending on TCR_EL1.TBID*, which we may make use of in future, so * we expose separate masks. */ unsigned long mask = ptrauth_user_pac_mask(); struct user_pac_mask uregs = { .data_mask = mask, .insn_mask = mask, }; if (!system_supports_address_auth()) return -EINVAL; return membuf_write(&to, &uregs, sizeof(uregs)); } static int pac_enabled_keys_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { long enabled_keys = ptrauth_get_enabled_keys(target); if (IS_ERR_VALUE(enabled_keys)) return enabled_keys; return membuf_write(&to, &enabled_keys, sizeof(enabled_keys)); } static int pac_enabled_keys_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; long enabled_keys = ptrauth_get_enabled_keys(target); if (IS_ERR_VALUE(enabled_keys)) return enabled_keys; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0, sizeof(long)); if (ret) return ret; return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK, enabled_keys); } #ifdef CONFIG_CHECKPOINT_RESTORE static __uint128_t pac_key_to_user(const struct ptrauth_key *key) { return (__uint128_t)key->hi << 64 | key->lo; } static struct ptrauth_key pac_key_from_user(__uint128_t ukey) { struct ptrauth_key key = { .lo = (unsigned long)ukey, .hi = (unsigned long)(ukey >> 64), }; return key; } static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, const struct ptrauth_keys_user *keys) { ukeys->apiakey = pac_key_to_user(&keys->apia); ukeys->apibkey = pac_key_to_user(&keys->apib); ukeys->apdakey = pac_key_to_user(&keys->apda); ukeys->apdbkey = pac_key_to_user(&keys->apdb); } static void pac_address_keys_from_user(struct ptrauth_keys_user *keys, const struct user_pac_address_keys *ukeys) { keys->apia = pac_key_from_user(ukeys->apiakey); keys->apib = pac_key_from_user(ukeys->apibkey); keys->apda = pac_key_from_user(ukeys->apdakey); keys->apdb = pac_key_from_user(ukeys->apdbkey); } static int pac_address_keys_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_address_keys user_keys; if (!system_supports_address_auth()) return -EINVAL; pac_address_keys_to_user(&user_keys, keys); return membuf_write(&to, &user_keys, sizeof(user_keys)); } static int pac_address_keys_set(struct 
task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_address_keys user_keys; int ret; if (!system_supports_address_auth()) return -EINVAL; pac_address_keys_to_user(&user_keys, keys); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_keys, 0, -1); if (ret) return ret; pac_address_keys_from_user(keys, &user_keys); return 0; } static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, const struct ptrauth_keys_user *keys) { ukeys->apgakey = pac_key_to_user(&keys->apga); } static void pac_generic_keys_from_user(struct ptrauth_keys_user *keys, const struct user_pac_generic_keys *ukeys) { keys->apga = pac_key_from_user(ukeys->apgakey); } static int pac_generic_keys_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_generic_keys user_keys; if (!system_supports_generic_auth()) return -EINVAL; pac_generic_keys_to_user(&user_keys, keys); return membuf_write(&to, &user_keys, sizeof(user_keys)); } static int pac_generic_keys_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_generic_keys user_keys; int ret; if (!system_supports_generic_auth()) return -EINVAL; pac_generic_keys_to_user(&user_keys, keys); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_keys, 0, -1); if (ret) return ret; pac_generic_keys_from_user(keys, &user_keys); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI static int tagged_addr_ctrl_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { long ctrl = get_tagged_addr_ctrl(target); if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; return membuf_write(&to, &ctrl, sizeof(ctrl)); } static int tagged_addr_ctrl_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; long ctrl; ctrl = get_tagged_addr_ctrl(target); if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; return set_tagged_addr_ctrl(target, ctrl); } #endif #ifdef CONFIG_ARM64_POE static int poe_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_poe()) return -EINVAL; return membuf_write(&to, &target->thread.por_el0, sizeof(target->thread.por_el0)); } static int poe_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; long ctrl; if (!system_supports_poe()) return -EINVAL; ctrl = target->thread.por_el0; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; target->thread.por_el0 = ctrl; return 0; } #endif #ifdef CONFIG_ARM64_GCS static void task_gcs_to_user(struct user_gcs *user_gcs, const struct task_struct *target) { user_gcs->features_enabled = target->thread.gcs_el0_mode; user_gcs->features_locked = target->thread.gcs_el0_locked; user_gcs->gcspr_el0 = target->thread.gcspr_el0; } static void task_gcs_from_user(struct task_struct *target, const struct user_gcs *user_gcs) { 
target->thread.gcs_el0_mode = user_gcs->features_enabled; target->thread.gcs_el0_locked = user_gcs->features_locked; target->thread.gcspr_el0 = user_gcs->gcspr_el0; } static int gcs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_gcs user_gcs; if (!system_supports_gcs()) return -EINVAL; if (target == current) gcs_preserve_current_state(); task_gcs_to_user(&user_gcs, target); return membuf_write(&to, &user_gcs, sizeof(user_gcs)); } static int gcs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; struct user_gcs user_gcs; if (!system_supports_gcs()) return -EINVAL; task_gcs_to_user(&user_gcs, target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); if (ret) return ret; if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) return -EINVAL; task_gcs_from_user(target, &user_gcs); return 0; } #endif enum aarch64_regset { REGSET_GPR, REGSET_FPR, REGSET_TLS, #ifdef CONFIG_HAVE_HW_BREAKPOINT REGSET_HW_BREAK, REGSET_HW_WATCH, #endif REGSET_FPMR, REGSET_SYSTEM_CALL, #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif #ifdef CONFIG_ARM64_SME REGSET_SSVE, REGSET_ZA, REGSET_ZT, #endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, REGSET_PAC_ENABLED_KEYS, #ifdef CONFIG_CHECKPOINT_RESTORE REGSET_PACA_KEYS, REGSET_PACG_KEYS, #endif #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE REGSET_POE, #endif #ifdef CONFIG_ARM64_GCS REGSET_GCS, #endif }; static const struct user_regset aarch64_regsets[] = { [REGSET_GPR] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), .regset_get = gpr_get, .set = gpr_set }, [REGSET_FPR] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_fpsimd_state) / sizeof(u32), /* * We pretend we have 32-bit registers because the fpsr and * fpcr are 32-bits wide. 
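 *
 * Editor's note (added commentary, not in the original source): with the
 * current uapi layout this arithmetic works out as 32 x 16-byte vregs
 * (512 bytes) plus 32-bit fpsr and fpcr plus two reserved words, 528
 * bytes in all, so .n above is 132 nominal u32 "registers". The u32
 * granularity is what lets fpsr and fpcr be addressed as whole units.
 *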
*/ .size = sizeof(u32), .align = sizeof(u32), .active = fpr_active, .regset_get = fpr_get, .set = fpr_set }, [REGSET_TLS] = { .core_note_type = NT_ARM_TLS, .n = 2, .size = sizeof(void *), .align = sizeof(void *), .regset_get = tls_get, .set = tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { .core_note_type = NT_ARM_HW_BREAK, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { .core_note_type = NT_ARM_HW_WATCH, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, #endif [REGSET_SYSTEM_CALL] = { .core_note_type = NT_ARM_SYSTEM_CALL, .n = 1, .size = sizeof(int), .align = sizeof(int), .regset_get = system_call_get, .set = system_call_set, }, [REGSET_FPMR] = { .core_note_type = NT_ARM_FPMR, .n = 1, .size = sizeof(u64), .align = sizeof(u64), .regset_get = fpmr_get, .set = fpmr_set, }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ .core_note_type = NT_ARM_SVE, .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, .regset_get = sve_get, .set = sve_set, }, #endif #ifdef CONFIG_ARM64_SME [REGSET_SSVE] = { /* Streaming mode SVE */ .core_note_type = NT_ARM_SSVE, .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, .regset_get = ssve_get, .set = ssve_set, }, [REGSET_ZA] = { /* SME ZA */ .core_note_type = NT_ARM_ZA, /* * ZA is a single register but it's variably sized and * the ptrace core requires that the size of any data * be an exact multiple of the configured register * size so report as though we had SVE_VQ_BYTES * registers. These values aren't exposed to * userspace. 
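 *
 * Editor's note (added commentary, not in the original source): as a
 * worked example of the sizing, ZA holds vl x vl bytes where vl is the
 * streaming vector length in bytes, so at vl = 64 (vq = 4) the ZA data
 * is 64 * 64 = 4096 bytes and a live dump is ZA_PT_ZA_OFFSET + 4096
 * bytes. The .n below simply expresses the SME_VQ_MAX worst case in
 * SVE_VQ_BYTES (16-byte) units.
 *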
*/ .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, .regset_get = za_get, .set = za_set, }, [REGSET_ZT] = { /* SME ZT */ .core_note_type = NT_ARM_ZT, .n = 1, .size = ZT_SIG_REG_BYTES, .align = sizeof(u64), .regset_get = zt_get, .set = zt_set, }, #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { .core_note_type = NT_ARM_PAC_MASK, .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), .regset_get = pac_mask_get, /* this cannot be set dynamically */ }, [REGSET_PAC_ENABLED_KEYS] = { .core_note_type = NT_ARM_PAC_ENABLED_KEYS, .n = 1, .size = sizeof(long), .align = sizeof(long), .regset_get = pac_enabled_keys_get, .set = pac_enabled_keys_set, }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { .core_note_type = NT_ARM_PACA_KEYS, .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), .regset_get = pac_address_keys_get, .set = pac_address_keys_set, }, [REGSET_PACG_KEYS] = { .core_note_type = NT_ARM_PACG_KEYS, .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), .regset_get = pac_generic_keys_get, .set = pac_generic_keys_set, }, #endif #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI [REGSET_TAGGED_ADDR_CTRL] = { .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, .n = 1, .size = sizeof(long), .align = sizeof(long), .regset_get = tagged_addr_ctrl_get, .set = tagged_addr_ctrl_set, }, #endif #ifdef CONFIG_ARM64_POE [REGSET_POE] = { .core_note_type = NT_ARM_POE, .n = 1, .size = sizeof(long), .align = sizeof(long), .regset_get = poe_get, .set = poe_set, }, #endif #ifdef CONFIG_ARM64_GCS [REGSET_GCS] = { .core_note_type = NT_ARM_GCS, .n = sizeof(struct user_gcs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), .regset_get = gcs_get, .set = gcs_set, }, #endif }; static const struct user_regset_view user_aarch64_view = { .name = "aarch64", .e_machine = EM_AARCH64, .regsets = aarch64_regsets, .n = ARRAY_SIZE(aarch64_regsets) }; enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, }; static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int idx) { struct pt_regs *regs = task_pt_regs(task); switch (idx) { case 15: return regs->pc; case 16: return pstate_to_compat_psr(regs->pstate); case 17: return regs->orig_x0; default: return regs->regs[idx]; } } static int compat_gpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int i = 0; while (to.left) membuf_store(&to, compat_get_user_reg(target, i++)); return 0; } static int compat_gpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct pt_regs newregs; int ret = 0; unsigned int i, start, num_regs; /* Calculate the number of AArch32 registers contained in count */ num_regs = count / regset->size; /* Convert pos into a register number */ start = pos / regset->size; if (start + num_regs > regset->n) return -EIO; newregs = *task_pt_regs(target); for (i = 0; i < num_regs; ++i) { unsigned int idx = start + i; compat_ulong_t reg; if (kbuf) { memcpy(&reg, kbuf, sizeof(reg)); kbuf += sizeof(reg); } else { ret = copy_from_user(&reg, ubuf, sizeof(reg)); if (ret) { ret = -EFAULT; break; } ubuf += sizeof(reg); } switch (idx) { case 15: newregs.pc = reg; break; case 16: reg = compat_psr_to_pstate(reg); newregs.pstate = reg; break; case 17: newregs.orig_x0 = reg; break; 
default: newregs.regs[idx] = reg; } } if (valid_user_regs(&newregs.user_regs, target)) *task_pt_regs(target) = newregs; else ret = -EINVAL; return ret; } static int compat_vfp_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; if (!system_supports_fpsimd()) return -EINVAL; uregs = &target->thread.uw.fpsimd_state; if (target == current) fpsimd_preserve_current_state(); /* * The VFP registers are packed into the fpsimd_state, so they all sit * nicely together for us. We just need to create the fpscr separately. */ membuf_write(&to, uregs, VFP_STATE_SIZE - sizeof(compat_ulong_t)); fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | (uregs->fpcr & VFP_FPSCR_CTRL_MASK); return membuf_store(&to, fpscr); } static int compat_vfp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; int ret, vregs_end_pos; if (!system_supports_fpsimd()) return -EINVAL; uregs = &target->thread.uw.fpsimd_state; vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, uregs, 0, vregs_end_pos); if (count && !ret) { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpscr, vregs_end_pos, VFP_STATE_SIZE); if (!ret) { uregs->fpsr = fpscr & VFP_FPSCR_STAT_MASK; uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; } } fpsimd_flush_task_state(target); return ret; } static int compat_tls_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { return membuf_store(&to, (compat_ulong_t)target->thread.uw.tp_value); } static int compat_tls_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; compat_ulong_t tls = target->thread.uw.tp_value; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); if (ret) return ret; target->thread.uw.tp_value = tls; return ret; } static const struct user_regset aarch32_regsets[] = { [REGSET_COMPAT_GPR] = { .core_note_type = NT_PRSTATUS, .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { .core_note_type = NT_ARM_VFP, .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), .active = fpr_active, .regset_get = compat_vfp_get, .set = compat_vfp_set }, }; static const struct user_regset_view user_aarch32_view = { .name = "aarch32", .e_machine = EM_ARM, .regsets = aarch32_regsets, .n = ARRAY_SIZE(aarch32_regsets) }; static const struct user_regset aarch32_ptrace_regsets[] = { [REGSET_GPR] = { .core_note_type = NT_PRSTATUS, .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_FPR] = { .core_note_type = NT_ARM_VFP, .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), .regset_get = compat_vfp_get, .set = compat_vfp_set }, [REGSET_TLS] = { .core_note_type = NT_ARM_TLS, .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), .regset_get = compat_tls_get, .set = compat_tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { .core_note_type = NT_ARM_HW_BREAK, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), 
.align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { .core_note_type = NT_ARM_HW_WATCH, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, #endif [REGSET_SYSTEM_CALL] = { .core_note_type = NT_ARM_SYSTEM_CALL, .n = 1, .size = sizeof(int), .align = sizeof(int), .regset_get = system_call_get, .set = system_call_set, }, }; static const struct user_regset_view user_aarch32_ptrace_view = { .name = "aarch32", .e_machine = EM_ARM, .regsets = aarch32_ptrace_regsets, .n = ARRAY_SIZE(aarch32_ptrace_regsets) }; #ifdef CONFIG_COMPAT static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t __user *ret) { compat_ulong_t tmp; if (off & 3) return -EIO; if (off == COMPAT_PT_TEXT_ADDR) tmp = tsk->mm->start_code; else if (off == COMPAT_PT_DATA_ADDR) tmp = tsk->mm->start_data; else if (off == COMPAT_PT_TEXT_END_ADDR) tmp = tsk->mm->end_code; else if (off < sizeof(compat_elf_gregset_t)) tmp = compat_get_user_reg(tsk, off >> 2); else if (off >= COMPAT_USER_SZ) return -EIO; else tmp = 0; return put_user(tmp, ret); } static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t val) { struct pt_regs newregs = *task_pt_regs(tsk); unsigned int idx = off / 4; if (off & 3 || off >= COMPAT_USER_SZ) return -EIO; if (off >= sizeof(compat_elf_gregset_t)) return 0; switch (idx) { case 15: newregs.pc = val; break; case 16: newregs.pstate = compat_psr_to_pstate(val); break; case 17: newregs.orig_x0 = val; break; default: newregs.regs[idx] = val; } if (!valid_user_regs(&newregs.user_regs, tsk)) return -EINVAL; *task_pt_regs(tsk) = newregs; return 0; } #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * Convert a virtual register number into an index for a thread_info * breakpoint array. Breakpoints are identified using positive numbers * whilst watchpoints are negative. The registers are laid out as pairs * of (address, control), each pair mapping to a unique hw_breakpoint struct. * Register 0 is reserved for describing resource information. 
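 *
 * Editor's note (added commentary, not in the original source): concretely,
 * num = 1/2 are the first breakpoint's address/control pair (idx 0),
 * num = 3/4 the second's (idx 1), and num = -1/-2 the first watchpoint's
 * address/control; compat_ptrace_hbp_num_to_idx() below computes
 * idx = (abs(num) - 1) >> 1, and the (num & 1) tests select address (odd)
 * versus control (even).
 *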
*/ static int compat_ptrace_hbp_num_to_idx(compat_long_t num) { return (abs(num) - 1) >> 1; } static int compat_ptrace_hbp_get_resource_info(u32 *kdata) { u8 num_brps, num_wrps, debug_arch, wp_len; u32 reg = 0; num_brps = hw_breakpoint_slots(TYPE_INST); num_wrps = hw_breakpoint_slots(TYPE_DATA); debug_arch = debug_monitors_arch(); wp_len = 8; reg |= debug_arch; reg <<= 8; reg |= wp_len; reg <<= 8; reg |= num_wrps; reg <<= 8; reg |= num_brps; *kdata = reg; return 0; } static int compat_ptrace_hbp_get(unsigned int note_type, struct task_struct *tsk, compat_long_t num, u32 *kdata) { u64 addr = 0; u32 ctrl = 0; int err, idx = compat_ptrace_hbp_num_to_idx(num); if (num & 1) { err = ptrace_hbp_get_addr(note_type, tsk, idx, &addr); *kdata = (u32)addr; } else { err = ptrace_hbp_get_ctrl(note_type, tsk, idx, &ctrl); *kdata = ctrl; } return err; } static int compat_ptrace_hbp_set(unsigned int note_type, struct task_struct *tsk, compat_long_t num, u32 *kdata) { u64 addr; u32 ctrl; int err, idx = compat_ptrace_hbp_num_to_idx(num); if (num & 1) { addr = *kdata; err = ptrace_hbp_set_addr(note_type, tsk, idx, addr); } else { ctrl = *kdata; err = ptrace_hbp_set_ctrl(note_type, tsk, idx, ctrl); } return err; } static int compat_ptrace_gethbpregs(struct task_struct *tsk, compat_long_t num, compat_ulong_t __user *data) { int ret; u32 kdata; /* Watchpoint */ if (num < 0) { ret = compat_ptrace_hbp_get(NT_ARM_HW_WATCH, tsk, num, &kdata); /* Resource info */ } else if (num == 0) { ret = compat_ptrace_hbp_get_resource_info(&kdata); /* Breakpoint */ } else { ret = compat_ptrace_hbp_get(NT_ARM_HW_BREAK, tsk, num, &kdata); } if (!ret) ret = put_user(kdata, data); return ret; } static int compat_ptrace_sethbpregs(struct task_struct *tsk, compat_long_t num, compat_ulong_t __user *data) { int ret; u32 kdata = 0; if (num == 0) return 0; ret = get_user(kdata, data); if (ret) return ret; if (num < 0) ret = compat_ptrace_hbp_set(NT_ARM_HW_WATCH, tsk, num, &kdata); else ret = compat_ptrace_hbp_set(NT_ARM_HW_BREAK, tsk, num, &kdata); return ret; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; switch (request) { case PTRACE_PEEKUSR: ret = compat_ptrace_read_user(child, addr, datap); break; case PTRACE_POKEUSR: ret = compat_ptrace_write_user(child, addr, data); break; case COMPAT_PTRACE_GETREGS: ret = copy_regset_to_user(child, &user_aarch32_view, REGSET_COMPAT_GPR, 0, sizeof(compat_elf_gregset_t), datap); break; case COMPAT_PTRACE_SETREGS: ret = copy_regset_from_user(child, &user_aarch32_view, REGSET_COMPAT_GPR, 0, sizeof(compat_elf_gregset_t), datap); break; case COMPAT_PTRACE_GET_THREAD_AREA: ret = put_user((compat_ulong_t)child->thread.uw.tp_value, (compat_ulong_t __user *)datap); break; case COMPAT_PTRACE_SET_SYSCALL: task_pt_regs(child)->syscallno = data; ret = 0; break; case COMPAT_PTRACE_GETVFPREGS: ret = copy_regset_to_user(child, &user_aarch32_view, REGSET_COMPAT_VFP, 0, VFP_STATE_SIZE, datap); break; case COMPAT_PTRACE_SETVFPREGS: ret = copy_regset_from_user(child, &user_aarch32_view, REGSET_COMPAT_VFP, 0, VFP_STATE_SIZE, datap); break; #ifdef CONFIG_HAVE_HW_BREAKPOINT case COMPAT_PTRACE_GETHBPREGS: ret = compat_ptrace_gethbpregs(child, addr, datap); break; case COMPAT_PTRACE_SETHBPREGS: ret = compat_ptrace_sethbpregs(child, addr, datap); break; #endif default: ret = compat_ptrace_request(child, 
request, addr, data); break; } return ret; } #endif /* CONFIG_COMPAT */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { /* * Core dumping of 32-bit tasks or compat ptrace requests must use the * user_aarch32_view compatible with arm32. Native ptrace requests on * 32-bit children use an extended user_aarch32_ptrace_view to allow * access to the TLS register. */ if (is_compat_task()) return &user_aarch32_view; else if (is_compat_thread(task_thread_info(task))) return &user_aarch32_ptrace_view; return &user_aarch64_view; } long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { switch (request) { case PTRACE_PEEKMTETAGS: case PTRACE_POKEMTETAGS: return mte_ptrace_copy_tags(child, request, addr, data); } return ptrace_request(child, request, addr, data); } enum ptrace_syscall_dir { PTRACE_SYSCALL_ENTER = 0, PTRACE_SYSCALL_EXIT, }; static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) { int regno; unsigned long saved_reg; /* * We have some ABI weirdness here in the way that we handle syscall * exit stops because we indicate whether or not the stop has been * signalled from syscall entry or syscall exit by clobbering a general * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee * and restoring its old value after the stop. This means that: * * - Any writes by the tracer to this register during the stop are * ignored/discarded. * * - The actual value of the register is not available during the stop, * so the tracer cannot save it and restore it later. * * - Syscall stops behave differently to seccomp and pseudo-step traps * (the latter do not nobble any registers). */ regno = (is_compat_task() ? 12 : 7); saved_reg = regs->regs[regno]; regs->regs[regno] = dir; if (dir == PTRACE_SYSCALL_ENTER) { if (ptrace_report_syscall_entry(regs)) forget_syscall(regs); regs->regs[regno] = saved_reg; } else if (!test_thread_flag(TIF_SINGLESTEP)) { ptrace_report_syscall_exit(regs, 0); regs->regs[regno] = saved_reg; } else { regs->regs[regno] = saved_reg; /* * Signal a pseudo-step exception since we are stepping but * tracer modifications to the registers may have rewound the * state machine. */ ptrace_report_syscall_exit(regs, 1); } } int syscall_trace_enter(struct pt_regs *regs) { unsigned long flags = read_thread_flags(); if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { report_syscall(regs, PTRACE_SYSCALL_ENTER); if (flags & _TIF_SYSCALL_EMU) return NO_SYSCALL; } /* Do the secure computing after ptrace; failures should be fast. */ if (secure_computing() == -1) return NO_SYSCALL; if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); audit_syscall_entry(regs->syscallno, regs->orig_x0, regs->regs[1], regs->regs[2], regs->regs[3]); return regs->syscallno; } void syscall_trace_exit(struct pt_regs *regs) { unsigned long flags = read_thread_flags(); audit_syscall_exit(regs); if (flags & _TIF_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) report_syscall(regs, PTRACE_SYSCALL_EXIT); rseq_syscall(regs); } /* * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a. * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is * not described in ARM DDI 0487D.a. * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may * be allocated an EL0 meaning in future. * Userspace cannot use these until they have an architectural meaning. 
* Note that this follows the SPSR_ELx format, not the AArch32 PSR format. * We also reserve IL for the kernel; SS is handled dynamically. */ #define SPSR_EL1_AARCH64_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \ GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) static int valid_compat_regs(struct user_pt_regs *regs) { regs->pstate &= ~SPSR_EL1_AARCH32_RES0_BITS; if (!system_supports_mixed_endian_el0()) { if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) regs->pstate |= PSR_AA32_E_BIT; else regs->pstate &= ~PSR_AA32_E_BIT; } if (user_mode(regs) && (regs->pstate & PSR_MODE32_BIT) && (regs->pstate & PSR_AA32_A_BIT) == 0 && (regs->pstate & PSR_AA32_I_BIT) == 0 && (regs->pstate & PSR_AA32_F_BIT) == 0) { return 1; } /* * Force PSR to a valid 32-bit EL0t, preserving the same bits as * arch/arm. */ regs->pstate &= PSR_AA32_N_BIT | PSR_AA32_Z_BIT | PSR_AA32_C_BIT | PSR_AA32_V_BIT | PSR_AA32_Q_BIT | PSR_AA32_IT_MASK | PSR_AA32_GE_MASK | PSR_AA32_E_BIT | PSR_AA32_T_BIT; regs->pstate |= PSR_MODE32_BIT; return 0; } static int valid_native_regs(struct user_pt_regs *regs) { regs->pstate &= ~SPSR_EL1_AARCH64_RES0_BITS; if (user_mode(regs) && !(regs->pstate & PSR_MODE32_BIT) && (regs->pstate & PSR_D_BIT) == 0 && (regs->pstate & PSR_A_BIT) == 0 && (regs->pstate & PSR_I_BIT) == 0 && (regs->pstate & PSR_F_BIT) == 0) { return 1; } /* Force PSR to a valid 64-bit EL0t */ regs->pstate &= PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT; return 0; } /* * Are the current registers suitable for user mode? (used to maintain * security in signal handlers) */ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task) { /* https://lore.kernel.org/lkml/20191118131525.GA4180@willie-the-truck */ user_regs_reset_single_step(regs, task); if (is_compat_thread(task_thread_info(task))) return valid_compat_regs(regs); else return valid_native_regs(regs); }
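/*
 * Editor's example (illustrative sketch, not part of this file): how the
 * x7 convention documented in report_syscall() above looks from a
 * userspace tracer. At a PTRACE_SYSCALL stop on an AArch64 tracee, x7
 * reads as 0 for a syscall-entry stop and 1 for a syscall-exit stop, and
 * tracer writes to it are discarded. Error handling and the waitpid()
 * loop are omitted; treat this as a hedged sketch, not a reference tracer.
 */
#include <elf.h>		/* NT_PRSTATUS */
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <asm/ptrace.h>		/* struct user_pt_regs (arm64) */

static void inspect_syscall_stop(pid_t pid)
{
	struct user_pt_regs regs;
	struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

	ptrace(PTRACE_GETREGSET, pid, (void *)NT_PRSTATUS, &iov);
	/* x8 carries the syscall number on arm64; x7 carries the direction. */
	printf("syscall %llu: %s\n", (unsigned long long)regs.regs[8],
	       regs.regs[7] == 0 ? "entry" : "exit");
	ptrace(PTRACE_SYSCALL, pid, NULL, NULL);	/* resume to next stop */
}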
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H #include <linux/mm.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/fs.h> #include <linux/hugetlb_inline.h> #include <linux/cgroup.h> #include <linux/page_ref.h> #include <linux/list.h> #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> #include <linux/userfaultfd_k.h> struct ctl_table; struct user_struct; struct mmu_gather; struct node; void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE #include <linux/pagemap.h> #include <linux/shm.h> #include <asm/tlbflush.h> /* * For a HugeTLB page, there is more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail * struct pages to store the metadata. */ #define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; long count; long max_hpages; /* Maximum huge pages or -1 if no maximum. */ long used_hpages; /* Used count against maximum, includes */ /* both allocated and reserved pages. */ struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ /* satisfy minimum size. 
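 *
 * Editor's note (added commentary, not in the original source): as a rough
 * worked example, a hugetlbfs mount with size=10 pages and min_size=2
 * pages ends up with max_hpages = 10 and min_hpages = 2; the two minimum
 * pages are reserved from the global pool up front and rsv_hpages tracks
 * how many of them are still held, while used_hpages counts both
 * allocated and reserved pages against max_hpages. (The mount-option
 * names are hugetlbfs's; the bring-up path is outside this excerpt.)
 *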
*/ }; struct resv_map { struct kref refs; spinlock_t lock; struct list_head regions; long adds_in_progress; struct list_head region_cache; long region_cache_count; struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored * here. If these fields are 0, then either the mapping is shared, or * cgroup accounting is disabled for this resv_map. */ struct page_counter *reservation_counter; unsigned long pages_per_hpage; struct cgroup_subsys_state *css; #endif }; /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * * The region data structures are embedded into a resv_map and protected * by a resv_map's lock. The set of regions within the resv_map represent * reservations for huge pages, or huge pages that have already been * instantiated within the map. The from and to elements are huge page * indices into the associated mapping. from indicates the starting index * of the region. to represents the first index past the end of the region. * * For example, a file region structure with from == 0 and to == 4 represents * four huge pages in a mapping. It is important to note that the to element * represents the first element past the end of the region. This is used in * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. * * Interval notation of the form [from, to) will be used to indicate that * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; long from; long to; #ifdef CONFIG_CGROUP_HUGETLB /* * On shared mappings, each reserved region appears as a struct * file_region in resv_map. These fields hold the info needed to * uncharge each reservation. */ struct page_counter *reservation_counter; struct cgroup_subsys_state *css; #endif }; struct hugetlb_vma_lock { struct kref refs; struct rw_semaphore rw_sema; struct vm_area_struct *vma; }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); extern spinlock_t hugetlb_lock; extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #endif /* 
CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address); struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); bool hugetlb_bootmem_allocated(void); /* arch callbacks */ #ifndef CONFIG_HIGHPTE /* * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures * which may go down to the lowest PTE level in their huge_pte_offset() and * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap(). */ static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address) { return pte_offset_kernel(pmd, address); } static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address); } #endif pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); /* * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * * IMPORTANT: we should normally not call this function directly; it is * only a common interface for implementing arch-specific walkers. Please * use hugetlb_walk() instead, because that will attempt to verify the * locking for you. * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function is responsible * for its thread safety. One can follow these rules: * * (1) For private mappings: pmd unsharing is not possible, so holding the * mmap_lock for either read or write is sufficient. Most callers * already hold the mmap_lock, so normally, no special action is * required. * * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged * pgtable page can go away from under us! It can be done by a pmd * unshare with a follow up munmap() on the other process), then we * need either: * * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare * won't happen upon the range (it also makes sure the pte_t we * read is the right and stable one), or, * * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make * sure even if unshare happened the racy unmap() will wait until * i_mmap_rwsem is released. * * Option (2.1) is the safest, which guarantees pte stability from the pmd * sharing pov, until the vma lock is released. 
Option (2.2) doesn't protect * a concurrent pmd unshare, but it makes sure the pgtable page is safe to * access. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); extern void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *begin, unsigned long *end); extern void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details); static inline void hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_begin(vma, start, end); } static inline void hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { if (is_vm_hugetlb_page(vma)) __hugetlb_zap_end(vma, details); } void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); void hugetlb_vma_lock_write(struct vm_area_struct *vma); void hugetlb_vma_unlock_write(struct vm_area_struct *vma); int hugetlb_vma_trylock_write(struct vm_area_struct *vma); void hugetlb_vma_assert_locked(struct vm_area_struct *vma); void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { } static inline unsigned long hugetlb_total_pages(void) { return 0; } static inline struct address_space *hugetlb_folio_mapping_lock_write( struct folio *folio) { return NULL; } static inline int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_begin( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } static inline void hugetlb_zap_end( struct vm_area_struct *vma, struct zap_details *details) { } static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { BUG(); return 0; } static inline int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { BUG(); return 0; } static inline void hugetlb_report_meminfo(struct seq_file *m) { } static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) { return 0; } static inline void hugetlb_show_meminfo_node(int nid) { } static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { return -EINVAL; } static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { } static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) { } static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { } static inline int 
hugetlb_vma_trylock_write(struct vm_area_struct *vma) { return 1; } static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { } static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { BUG(); } #ifdef CONFIG_USERFAULTFD static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { BUG(); return 0; } #endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { return NULL; } static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void folio_putback_hugetlb(struct folio *folio) { } static inline void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { } static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { return 0; } static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { BUG(); } static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { BUG(); return 0; } static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write static inline int pgd_write(pgd_t pgd) { BUG(); return 0; } #endif #define HUGETLB_ANON_FILE "anon_hugepage" enum { /* * The file will be used as a shm file, so shmfs accounting rules * apply. */ HUGETLB_SHMFS_INODE = 1, /* * The file is being created on the internal vfs mount and shmfs * accounting rules do not apply. */ HUGETLB_ANONHUGE_INODE = 2, }; #ifdef CONFIG_HUGETLBFS struct hugetlbfs_sb_info { long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; struct hstate *hstate; struct hugepage_subpool *spool; kuid_t uid; kgid_t gid; umode_t mode; }; static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) { return sb->s_fs_info; } struct hugetlbfs_inode_info { struct inode vfs_inode; unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) { return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, int creat_flags, int page_size_log); static inline bool is_file_hugepages(const struct file *file) { return file->f_op->fop_flags & FOP_HUGE_PAGES; } static inline struct hstate *hstate_inode(struct inode *i) { return HUGETLBFS_SB(i->i_sb)->hstate; } #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) false static inline struct file * hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { return ERR_PTR(-ENOSYS); } static inline struct hstate *hstate_inode(struct inode *i) { return
NULL; } #endif /* !CONFIG_HUGETLBFS */ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); /* * hugetlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be * used to manipulate these flags. * * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. * Synchronization: Examined or modified by code that knows it has * the only reference to page. i.e. After allocation but before use * or when the page is being freed. * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. * Synchronization: Can be set after huge page allocation from buddy when * code knows it has the only reference. All other examinations and * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page * that is not tracked by the raw_hwp_page list. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, HPG_freed, HPG_vmemmap_optimized, HPG_raw_hwp_unreliable, HPG_cma, __NR_HPAGEFLAGS, }; /* * Macros to create test, set and clear function definitions for * hugetlb specific page flags.
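* * For example, the HPAGEFLAG(Migratable, migratable) invocation below expands to folio_test_hugetlb_migratable(), folio_set_hugetlb_migratable() and folio_clear_hugetlb_migratable(), each operating on bit HPG_migratable in the head page's page.private word. Note that the uname argument is purely cosmetic here: the generated names are built from flname.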
*/ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ { } #endif #define HPAGEFLAG(uname, flname) \ TESTHPAGEFLAG(uname, flname) \ SETHPAGEFLAG(uname, flname) \ CLEARHPAGEFLAG(uname, flname) \ /* * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) HPAGEFLAG(Cma, cma) #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { struct mutex resize_lock; struct lock_class_key resize_key; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; unsigned int demote_order; unsigned long mask; unsigned long max_huge_pages; unsigned long nr_huge_pages; unsigned long free_huge_pages; unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; char name[HSTATE_NAME_LEN]; }; struct cma; struct huge_bootmem_page { struct list_head list; struct hstate *hstate; unsigned long flags; struct cma *cma; }; #define HUGE_BOOTMEM_HVO 0x0001 #define HUGE_BOOTMEM_ZONES_VALID 0x0002 #define HUGE_BOOTMEM_CMA 0x0004 bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); int __init alloc_bootmem_huge_page(struct hstate *h, int nid); bool __init hugetlb_node_alloc_supported(void); void __init hugetlb_add_hstate(unsigned order); bool 
__init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE #define HUGE_MAX_HSTATE 1 #endif extern struct hstate hstates[HUGE_MAX_HSTATE]; extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return folio->_hugetlb_subpool; } static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { folio->_hugetlb_subpool = subpool; } static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); } static inline struct hstate *hstate_sizelog(int page_size_log) { if (!page_size_log) return &default_hstate; if (page_size_log < BITS_PER_LONG) return size_to_hstate(1UL << page_size_log); return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return hstate_file(vma->vm_file); } static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma); extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma); static inline unsigned long huge_page_mask(struct hstate *h) { return h->mask; } static inline unsigned int huge_page_order(struct hstate *h) { return h->order; } static inline unsigned huge_page_shift(struct hstate *h) { return h->order + PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } static inline unsigned int blocks_per_huge_page(struct hstate *h) { return huge_page_size(h) / 512; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return filemap_lock_folio(mapping, idx << huge_page_order(h)); } #include <asm/hugetlb.h> #ifndef is_hugepage_only_range static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { return 0; } #define is_hugepage_only_range is_hugepage_only_range #endif #ifndef arch_clear_hugetlb_flags static inline void arch_clear_hugetlb_flags(struct folio *folio) { } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags #endif #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { return pte_mkhuge(entry); } #endif #ifndef arch_has_huge_bootmem_alloc /* * Some architectures do their own bootmem allocation, so they can't use * early CMA allocation. 
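* Such architectures (for instance, ones that override the * __alloc_bootmem_huge_page() arch callback declared earlier) can override * this hook to return true.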
*/ static inline bool arch_has_huge_bootmem_alloc(void) { return false; } #endif static inline struct hstate *folio_hstate(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); return size_to_hstate(folio_size(folio)); } static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; } static inline int hstate_index(struct hstate *h) { return h - hstates; } int dissolve_free_hugetlb_folio(struct folio *folio); int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) { if ((huge_page_shift(h) == PMD_SHIFT) || (huge_page_shift(h) == PUD_SHIFT) || (huge_page_shift(h) == PGDIR_SHIFT)) return true; else return false; } #endif #else static inline bool arch_hugetlb_migration_supported(struct hstate *h) { return false; } #endif static inline bool hugepage_migration_supported(struct hstate *h) { return arch_hugetlb_migration_supported(h); } /* * The movability check is different from the migration check. * It determines whether a huge page should be placed in the * movable zone. Movability of a huge page is relevant only if * its size is supported for migration: there is no reason for * the huge page to be movable if it is not migratable to start * with. Also, the size of the huge page should be large enough * to be placed in a movable zone and still small enough that * migration remains feasible; mere presence in a movable zone * does not by itself make migration feasible. * * So even though large huge page sizes like the gigantic ones * are migratable, they should not be movable, because it is not * feasible to migrate them out of a movable zone. */ static inline bool hugepage_movable_supported(struct hstate *h) { if (!hugepage_migration_supported(h)) return false; if (hstate_is_gigantic(h)) return false; return true; } /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { gfp_t gfp = __GFP_COMP | __GFP_NOWARN; gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { gfp_t modified_mask = htlb_alloc_mask(h); /* Some callers might want to enforce node */ modified_mask |= (gfp_mask & __GFP_THISNODE); modified_mask |= (gfp_mask & __GFP_NOWARN); return modified_mask; } static inline bool htlb_allow_alloc_fallback(int reason) { bool allowed_fallback = false; /* * Note: memory offline, memory failure and the migration syscalls are * allowed to fall back to other nodes for lack of a better choice, even * though that might break the per-node hugetlb pool. Other cases set * __GFP_THISNODE to avoid breaking the per-node hugetlb pool. */ switch (reason) { case MR_MEMORY_HOTPLUG: case MR_MEMORY_FAILURE: case MR_SYSCALL: case MR_MEMPOLICY_MBIND: allowed_fallback = true; break; default: break; } return allowed_fallback; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { const unsigned long size = huge_page_size(h); VM_WARN_ON(size == PAGE_SIZE); /* * hugetlb must use the exact same PT locks as core-mm page table * walkers would.
When modifying a PTE table, hugetlb must take the * PTE PT lock; when modifying a PMD table, hugetlb must take the PMD * PT lock, etc. * * The expectation is that any hugetlb folio smaller than a PMD is * always mapped into a single PTE table and that any hugetlb folio * smaller than a PUD (but at least as big as a PMD) is always mapped * into a single PMD table. * * If that does not hold for an architecture, then that architecture * must disable split PT locks such that all *_lockptr() functions * will give us the same result: the per-MM PT lock. * * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() * and core-mm would use pmd_lockptr(). However, in such configurations * split PMD locks are disabled -- they don't make sense on a single * PGDIR page table -- and the end result is the same. */ if (size >= PUD_SIZE) return pud_lockptr(mm, (pud_t *) pte); else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) return pmd_lockptr(mm, (pmd_t *) pte); /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ return ptep_lockptr(mm, pte); } #ifndef hugepages_supported /* * Some platforms decide whether they support huge pages at boot * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0 * when there is no such support. */ #define hugepages_supported() (HPAGE_SHIFT != 0) #endif void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); static inline void hugetlb_count_init(struct mm_struct *mm) { atomic_long_set(&mm->hugetlb_usage, 0); } static inline void hugetlb_count_add(long l, struct mm_struct *mm) { atomic_long_add(l, &mm->hugetlb_usage); } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long psize = huge_page_size(hstate_vma(vma)); return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif #ifndef huge_ptep_modify_prot_commit #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { unsigned long psize = huge_page_size(hstate_vma(vma)); set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif #ifdef CONFIG_NUMA void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif /* * Check if a given raw @page in a hugepage is HWPOISON.
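* That is, roughly, whether @page is recorded on the folio's list of raw * hwpoisoned subpages; when HPG_raw_hwp_unreliable (above) is set, all * subpages are reported as poisoned.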
*/ bool is_raw_hwpoison_page_in_hugepage(struct page *page); static inline unsigned long huge_page_mask_align(struct file *file) { return PAGE_MASK & ~huge_page_mask(hstate_file(file)); } #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; static inline unsigned long huge_page_mask_align(struct file *file) { return 0; } static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; } static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, struct address_space *mapping, pgoff_t idx) { return NULL; } static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { return -ENOMEM; } static inline int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline void wait_for_freed_hugetlb_folios(void) { } static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, bool cow_from_owner) { return NULL; } static inline struct folio * alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; } static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { return NULL; } static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; } static inline struct hstate *hstate_file(struct file *f) { return NULL; } static inline struct hstate *hstate_sizelog(int page_size_log) { return NULL; } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) { return NULL; } static inline struct hstate *folio_hstate(struct folio *folio) { return NULL; } static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; } static inline unsigned long huge_page_size(struct hstate *h) { return PAGE_SIZE; } static inline unsigned long huge_page_mask(struct hstate *h) { return PAGE_MASK; } static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return PAGE_SIZE; } static inline unsigned int huge_page_order(struct hstate *h) { return 0; } static inline unsigned int huge_page_shift(struct hstate *h) { return PAGE_SHIFT; } static inline bool hstate_is_gigantic(struct hstate *h) { return false; } static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } static inline unsigned hstate_index_to_shift(unsigned index) { return 0; } static inline int hstate_index(struct hstate *h) { return 0; } static inline int dissolve_free_hugetlb_folio(struct folio *folio) { return 0; } static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { return 0; } static inline bool hugepage_migration_supported(struct hstate *h) { return false; } static inline bool hugepage_movable_supported(struct hstate *h) { return false; } static inline gfp_t htlb_alloc_mask(struct hstate *h) { return 0; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) { return 0; } static inline bool htlb_allow_alloc_fallback(int reason) { return false; } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void hugetlb_count_init(struct mm_struct *mm) { } static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) { } static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } static inline pte_t 
huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_MMU return ptep_get(ptep); #else return *ptep; #endif } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { } static inline void hugetlb_register_node(struct node *node) { } static inline void hugetlb_unregister_node(struct node *node) { } static inline bool hugetlbfs_pagecache_present( struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return false; } static inline void hugetlb_bootmem_alloc(void) { } static inline bool hugetlb_bootmem_allocated(void) { return false; } #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { spinlock_t *ptl; ptl = huge_pte_lockptr(h, mm, pte); spin_lock(ptl); return ptl; } #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) extern void __init hugetlb_cma_reserve(int order); #else static inline __init void hugetlb_cma_reserve(int order) { } #endif #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { return page_count(virt_to_page(pte)) > 1; } #else static inline bool hugetlb_pmd_shared(pte_t *pte) { return false; } #endif bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE /* * ARCHes with special requirements for evicting HUGETLB backing TLB entries can * implement this. */ #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif static inline bool __vma_shareable_lock(struct vm_area_struct *vma) { return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). */ static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { #if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* * If pmd sharing possible, locking needed to safely walk the * hugetlb pgtables. More information can be found at the comment * above huge_pte_offset() in the same file. * * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. */ if (__vma_shareable_lock(vma)) WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && !lockdep_is_held( &vma->vm_file->f_mapping->i_mmap_rwsem)); #endif return huge_pte_offset(vma->vm_mm, addr, sz); } #endif /* _LINUX_HUGETLB_H */
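As a usage illustration of the locking rules documented above huge_pte_offset() and of the huge_pte_lock() helper, here is a minimal sketch; it is not part of the header. hugetlb_pte_is_present() is a hypothetical name, the caller is assumed to hold mmap_lock for read on a hugetlb VMA, and the three-argument huge_ptep_get(mm, addr, ptep) form of recent kernels is assumed.

/*
 * Illustrative sketch only: look up the huge pte covering @addr in a
 * hugetlb VMA and test whether it is populated, following rule (2.1)
 * above (hold the hugetlb vma lock so a concurrent pmd unshare cannot
 * free the shared page table from under us). The caller holds mmap_lock
 * for read. hugetlb_pte_is_present() is a hypothetical helper.
 */
static bool hugetlb_pte_is_present(struct vm_area_struct *vma, unsigned long addr)
{
	struct hstate *h = hstate_vma(vma);
	spinlock_t *ptl;
	pte_t *ptep;
	bool present = false;

	hugetlb_vma_lock_read(vma);
	ptep = hugetlb_walk(vma, addr & huge_page_mask(h), huge_page_size(h));
	if (ptep) {
		/* Take the same PT lock core-mm walkers would use. */
		ptl = huge_pte_lock(h, vma->vm_mm, ptep);
		/* Assumes the three-argument huge_ptep_get() of recent kernels. */
		present = !huge_pte_none(huge_ptep_get(vma->vm_mm, addr, ptep));
		spin_unlock(ptl);
	}
	hugetlb_vma_unlock_read(vma);

	return present;
}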
109 133 134 47 18 19 30 46 19 19 30 11 17 18 7 11 71 11 17 88 30 63 59 105 77 77 77 77 77 77 77 77 77 77 77 95 95 45 3 3 163 163 95 95 95 94 23 45 3 46 43 85 84 84 109 58 51 43 43 43 66 66 66 66 66 66 66 66 66 43 43 29 14 14 43 43 43 43 43 43 43 43 43 43 43 95 95 66 27 27 27 5 5 26 27 27 27 27 10 18 18 27 27 89 67 27 27 27 87 67 27 27 43 43 6 14 29 43 49 49 1 49 49 49 49 163 159 163 63 66 63 3 66 66 66 63 3 3 94 95 30 25 5 27 18 18 18 4 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 
805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 
1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 
2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 
2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 
3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 
4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 
5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 
5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 
6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 
// SPDX-License-Identifier: GPL-2.0-only /* * Generic hugetlb support.
* (C) Nadia Yvette Chambers, April 2004 */ #include <linux/list.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/sysctl.h> #include <linux/highmem.h> #include <linux/mmu_notifier.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> #include <linux/minmax.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> #include <linux/numa.h> #include <linux/llist.h> #include <linux/cma.h> #include <linux/migrate.h> #include <linux/nospec.h> #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> #include <linux/padata.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/setup.h> #include <linux/io.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" #include "hugetlb_cma.h" #include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; /* * Due to ordering constraints across the init code for various * architectures, hugetlb hstate cmdline parameters can't simply * be early_param. early_param might call the setup function * before valid hugetlb page sizes are determined, leading to * incorrect rejection of valid hugepagesz= options. * * So, record the parameters early and consume them whenever the * init code is ready for them, by calling hugetlb_parse_params(). */ /* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */ #define HUGE_MAX_CMDLINE_ARGS (2 * HUGE_MAX_HSTATE + 1) struct hugetlb_cmdline { char *val; int (*setup)(char *val); }; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static bool __initdata parsed_valid_hugepagesz = true; static bool __initdata parsed_default_hugepagesz; static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; static unsigned long hugepage_allocation_threads __initdata; static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata; static int hstate_cmdline_index __initdata; static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata; static int hugetlb_param_index __initdata; static __init int hugetlb_add_param(char *s, int (*setup)(char *val)); static __init void hugetlb_parse_params(void); #define hugetlb_early_param(str, func) \ static __init int func##args(char *s) \ { \ return hugetlb_add_param(s, func); \ } \ early_param(str, func##args) /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. 
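 *
 * Illustrative use (a sketch, not code from this file; it assumes the
 * hugetlb_fault_mutex_hash() helper): fault paths hash (mapping, index)
 * to pick a mutex from the table below, roughly:
 *
 *	u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
 *
 *	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 *	... handle the fault ...
 *	mutex_unlock(&hugetlb_fault_mutex_table[hash]);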
*/ static int num_fault_mutexes __ro_after_init; struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static void hugetlb_free_folio(struct folio *folio) { if (folio_test_hugetlb_cma(folio)) { hugetlb_cma_free_folio(folio); return; } folio_put(folio); } static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) return false; if (spool->max_hpages != -1) return spool->used_hpages == 0; if (spool->min_hpages != -1) return spool->rsv_hpages == spool->min_hpages; return true; } static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) { spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and * free the subpool */ if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); kfree(spool); } } struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages) { struct hugepage_subpool *spool; spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = max_hpages; spool->hstate = h; spool->min_hpages = min_hpages; if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { kfree(spool); return NULL; } spool->rsv_hpages = min_hpages; return spool; } void hugepage_put_subpool(struct hugepage_subpool *spool) { unsigned long flags; spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; unlock_or_release_subpool(spool, flags); } /* * Subpool accounting for allocating and reserving pages. * Return -ENOMEM if there are not enough resources to satisfy the * request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; if (!spool) return ret; spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) spool->used_hpages += delta; else { ret = -ENOMEM; goto unlock_ret; } } /* minimum size accounting */ if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on * behalf of subpool. Return difference. */ ret = delta - spool->rsv_hpages; spool->rsv_hpages = 0; } else { ret = 0; /* reserves already accounted for */ spool->rsv_hpages -= delta; } } unlock_ret: spin_unlock_irq(&spool->lock); return ret; } /* * Subpool accounting for freeing and unreserving pages. * Return the number of global page reservations that must be dropped. * The return value may only be different than the passed value (delta) * in the case where a subpool minimum size must be maintained. 
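 *
 * Worked example (illustrative): with min_hpages == 4, rsv_hpages == 2 and
 * used_hpages below the minimum, putting back delta == 3 pages keeps two of
 * them as subpool reserves (topping rsv_hpages up to 4) and returns 1, so
 * only one global reservation is dropped.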
*/ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; unsigned long flags; if (!spool) return delta; spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; /* minimum size accounting */ if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else ret = spool->rsv_hpages + delta - spool->min_hpages; spool->rsv_hpages += delta; if (spool->rsv_hpages > spool->min_hpages) spool->rsv_hpages = spool->min_hpages; } /* * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ unlock_or_release_subpool(spool, flags); return ret; } static inline struct hugepage_subpool *subpool_inode(struct inode *inode) { return HUGETLBFS_SB(inode->i_sb)->spool; } static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); } /* * hugetlb vma_lock helper routines */ void hugetlb_vma_lock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_read(&resv_map->rw_sema); } } void hugetlb_vma_unlock_read(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_read(&resv_map->rw_sema); } } void hugetlb_vma_lock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); down_write(&resv_map->rw_sema); } } void hugetlb_vma_unlock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; return down_write_trylock(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); return down_write_trylock(&resv_map->rw_sema); } return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); lockdep_assert_held(&resv_map->rw_sema); } } void hugetlb_vma_lock_release(struct kref *kref) { struct hugetlb_vma_lock *vma_lock = container_of(kref, struct hugetlb_vma_lock, refs); kfree(vma_lock); } static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; /* * The vma_lock structure may or may not be released as a result of the * put; it certainly will no longer be attached to the vma, so clear the * pointer. The semaphore synchronizes access to the vma_lock->vma field.
*/ vma_lock->vma = NULL; vma->vm_private_data = NULL; up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); } static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) { if (__vma_shareable_lock(vma)) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); } else if (__vma_private_lock(vma)) { struct resv_map *resv_map = vma_resv_map(vma); /* no free for anon vmas, but still need to unlock */ up_write(&resv_map->rw_sema); } } static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* * Only present in sharable vmas. */ if (!vma || !__vma_shareable_lock(vma)) return; if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); __hugetlb_vma_unlock_write_put(vma_lock); } } static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) return; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) return; vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); if (!vma_lock) { /* * If we cannot allocate the structure, then the vma cannot * participate in pmd sharing. This is only a possible * performance enhancement and memory-saving issue. * However, the lock is also used to synchronize page * faults with truncation. If the lock is not present, * unlikely races could leave pages in a file past i_size * until the file is removed. Warn in the unlikely case of * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; } /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); resv->region_cache_count--; nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); nrg->from = from; nrg->to = to; return nrg; } static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB nrg->reservation_counter = rg->reservation_counter; nrg->css = rg->css; if (rg->css) css_get(rg->css); #endif } /* Helper that records hugetlb_cgroup uncharge info. */ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, struct hstate *h, struct resv_map *resv, struct file_region *nrg) { #ifdef CONFIG_CGROUP_HUGETLB if (h_cg) { nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; nrg->css = &h_cg->css; /* * The caller will hold exactly one h_cg->css reference for the * whole contiguous reservation region. But this area might be * scattered when there are already some file_regions residing in * it. As a result, many file_regions may share only one css * reference. In order to ensure that one file_region must hold * exactly one h_cg->css reference, we should do css_get for * each file_region and leave the reference held by caller * untouched. */ css_get(&h_cg->css); if (!resv->pages_per_hpage) resv->pages_per_hpage = pages_per_huge_page(h); /* pages_per_hpage should be the same for all entries in * a resv_map.
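 *
 * (A resv_map is tied to a single file or mapping and hence a single
 * hstate, so a mismatch here would indicate a caller bug.)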
*/ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); } else { nrg->reservation_counter = NULL; nrg->css = NULL; } #endif } static void put_uncharge_info(struct file_region *rg) { #ifdef CONFIG_CGROUP_HUGETLB if (rg->css) css_put(rg->css); #endif } static bool has_same_uncharge_info(struct file_region *rg, struct file_region *org) { #ifdef CONFIG_CGROUP_HUGETLB return rg->reservation_counter == org->reservation_counter && rg->css == org->css; #else return true; #endif } static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && has_same_uncharge_info(prg, rg)) { prg->to = rg->to; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); rg = prg; } nrg = list_next_entry(rg, link); if (&nrg->link != &resv->regions && nrg->from == rg->to && has_same_uncharge_info(nrg, rg)) { nrg->from = rg->from; list_del(&rg->link); put_uncharge_info(rg); kfree(rg); } } static inline long hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, long to, struct hstate *h, struct hugetlb_cgroup *cg, long *regions_needed) { struct file_region *nrg; if (!regions_needed) { nrg = get_file_region_entry_from_cache(map, from, to); record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); list_add(&nrg->link, rg); coalesce_file_region(map, nrg); } else *regions_needed += 1; return to - from; } /* * Must be called with resv->lock held. * * Calling this with regions_needed != NULL will count the number of pages * to be added but will not modify the linked list. And regions_needed will * indicate the number of file_regions needed in the cache to carry out to add * the regions for this range. */ static long add_reservation_in_range(struct resv_map *resv, long f, long t, struct hugetlb_cgroup *h_cg, struct hstate *h, long *regions_needed) { long add = 0; struct list_head *head = &resv->regions; long last_accounted_offset = f; struct file_region *iter, *trg = NULL; struct list_head *rg = NULL; if (regions_needed) *regions_needed = 0; /* In this loop, we essentially handle an entry for the range * [last_accounted_offset, iter->from), at every iteration, with some * bounds checking. */ list_for_each_entry_safe(iter, trg, head, link) { /* Skip irrelevant regions that start before our range. */ if (iter->from < f) { /* If this region ends after the last accounted offset, * then we need to update last_accounted_offset. */ if (iter->to > last_accounted_offset) last_accounted_offset = iter->to; continue; } /* When we find a region that starts beyond our range, we've * finished. */ if (iter->from >= t) { rg = iter->link.prev; break; } /* Add an entry for last_accounted_offset -> iter->from, and * update last_accounted_offset. */ if (iter->from > last_accounted_offset) add += hugetlb_resv_map_add(resv, iter->link.prev, last_accounted_offset, iter->from, h, h_cg, regions_needed); last_accounted_offset = iter->to; } /* Handle the case where our range extends beyond * last_accounted_offset. */ if (!rg) rg = head->prev; if (last_accounted_offset < t) add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, t, h, h_cg, regions_needed); return add; } /* Must be called with resv->lock acquired. Will drop lock to allocate entries. 
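 *
 * (The lock must be dropped because kmalloc(GFP_KERNEL) may sleep. The
 * cache count is re-checked after the lock is retaken, since concurrent
 * region_add()/region_del() calls may have consumed entries meanwhile.)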
*/ static int allocate_file_region_entries(struct resv_map *resv, int regions_needed) __must_hold(&resv->lock) { LIST_HEAD(allocated_regions); int to_allocate = 0, i = 0; struct file_region *trg = NULL, *rg = NULL; VM_BUG_ON(regions_needed < 0); /* * Check for sufficient descriptors in the cache to accommodate * the number of in progress add operations plus regions_needed. * * This is a while loop because when we drop the lock, some other call * to region_add or region_del may have consumed some region_entries, * so we keep looping here until we finally have enough entries for * (adds_in_progress + regions_needed). */ while (resv->region_cache_count < (resv->adds_in_progress + regions_needed)) { to_allocate = resv->adds_in_progress + regions_needed - resv->region_cache_count; /* At this point, we should have enough entries in the cache * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); spin_unlock(&resv->lock); for (i = 0; i < to_allocate; i++) { trg = kmalloc(sizeof(*trg), GFP_KERNEL); if (!trg) goto out_of_memory; list_add(&trg->link, &allocated_regions); } spin_lock(&resv->lock); list_splice(&allocated_regions, &resv->region_cache); resv->region_cache_count += to_allocate; } return 0; out_of_memory: list_for_each_entry_safe(rg, trg, &allocated_regions, link) { list_del(&rg->link); kfree(rg); } return -ENOMEM; } /* * Add the huge page range represented by [f, t) to the reserve * map. Regions will be taken from the cache to fill in this range. * Sufficient regions should exist in the cache due to the previous * call to region_chg with the same range, but in some cases the cache will not * have sufficient entries due to races with other code doing region_add or * region_del. The extra needed entries will be allocated. * * regions_needed is the out value provided by a previous call to region_chg. * * Return the number of new huge pages added to the map. This number is greater * than or equal to zero. If file_region entries needed to be allocated for * this operation and we were not able to allocate, it returns -ENOMEM. * region_add of regions of length 1 never allocate file_regions and cannot * fail; region_chg will always allocate at least 1 entry and a region_add for * 1 page will only require at most 1 entry. */ static long region_add(struct resv_map *resv, long f, long t, long in_regions_needed, struct hstate *h, struct hugetlb_cgroup *h_cg) { long add = 0, actual_regions_needed = 0; spin_lock(&resv->lock); retry: /* Count how many regions are actually needed to execute this add. */ add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed); /* * Check for sufficient descriptors in the cache to accommodate * this add operation. Note that actual_regions_needed may be greater * than in_regions_needed, as the resv_map may have been modified since * the region_chg call. In this case, we need to make sure that we * allocate extra entries, such that we have enough for all the * existing adds_in_progress, plus the excess needed for this * operation. */ if (actual_regions_needed > in_regions_needed && resv->region_cache_count < resv->adds_in_progress + (actual_regions_needed - in_regions_needed)) { /* region_add operation of range 1 should never need to * allocate file_region entries. 
*/ VM_BUG_ON(t - f <= 1); if (allocate_file_region_entries( resv, actual_regions_needed - in_regions_needed)) { return -ENOMEM; } goto retry; } add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); return add; } /* * Examine the existing reserve map and determine how many * huge pages in the specified range [f, t) are NOT currently * represented. This routine is called before a subsequent * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the * map. A number of new file_region structures is added to the cache as a * placeholder, for the subsequent region_add call to use. At least 1 * file_region structure is added. * * out_regions_needed is the number of regions added to the * resv->adds_in_progress. This value needs to be provided to a follow up call * to region_add or region_abort for proper accounting. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to * zero. -ENOMEM is returned if a new file_region structure or cache entry * is needed and can not be allocated. */ static long region_chg(struct resv_map *resv, long f, long t, long *out_regions_needed) { long chg = 0; spin_lock(&resv->lock); /* Count how many hugepages in this range are NOT represented. */ chg = add_reservation_in_range(resv, f, t, NULL, NULL, out_regions_needed); if (*out_regions_needed == 0) *out_regions_needed = 1; if (allocate_file_region_entries(resv, *out_regions_needed)) return -ENOMEM; resv->adds_in_progress += *out_regions_needed; spin_unlock(&resv->lock); return chg; } /* * Abort the in progress add operation. The adds_in_progress field * of the resv_map keeps track of the operations in progress between * calls to region_chg and region_add. Operations are sometimes * aborted after the call to region_chg. In such cases, region_abort * is called to decrement the adds_in_progress counter. regions_needed * is the value returned by the region_chg call, it is used to decrement * the adds_in_progress counter. * * NOTE: The range arguments [f, t) are not needed or used in this * routine. They are kept to make reading the calling code easier as * arguments will match the associated region_chg call. */ static void region_abort(struct resv_map *resv, long f, long t, long regions_needed) { spin_lock(&resv->lock); VM_BUG_ON(!resv->region_cache_count); resv->adds_in_progress -= regions_needed; spin_unlock(&resv->lock); } /* * Delete the specified range [f, t) from the reserve map. If the * t parameter is LONG_MAX, this indicates that ALL regions after f * should be deleted. Locate the regions which intersect [f, t) * and either trim, delete or split the existing regions. * * Returns the number of huge pages deleted from the reserve map. * In the normal case, the return value is zero or more. In the * case where a region must be split, a new region descriptor must * be allocated. If the allocation fails, -ENOMEM will be returned. * NOTE: If the parameter t == LONG_MAX, then we will never split * a region and possibly return -ENOMEM. Callers specifying * t == LONG_MAX do not need to check for -ENOMEM error. 
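 *
 * Worked example (illustrative): deleting [5, 7) from an existing region
 * [0, 10) trims the original to [0, 5) and needs a fresh descriptor for the
 * remainder [7, 10); allocating that descriptor is the step that can fail.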
*/ static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; struct file_region *nrg = NULL; long del = 0; retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { /* * Skip regions before the range to be deleted. file_region * ranges are normally of the form [from, to). However, there * may be a "placeholder" entry in the map which is of the form * (from, to) with from == to. Check for placeholder entries * at the beginning of the range to be deleted. */ if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; if (rg->from >= t) break; if (f > rg->from && t < rg->to) { /* Must split region */ /* * Check for an entry in the cache before dropping * lock and attempting allocation. */ if (!nrg && resv->region_cache_count > resv->adds_in_progress) { nrg = list_first_entry(&resv->region_cache, struct file_region, link); list_del(&nrg->link); resv->region_cache_count--; } if (!nrg) { spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) return -ENOMEM; goto retry; } del += t - f; hugetlb_cgroup_uncharge_file_region( resv, rg, t - f, false); /* New entry for end of split region */ nrg->from = t; nrg->to = rg->to; copy_hugetlb_cgroup_uncharge_info(nrg, rg); INIT_LIST_HEAD(&nrg->link); /* Original entry is trimmed */ rg->to = f; list_add(&nrg->link, &rg->link); nrg = NULL; break; } if (f <= rg->from && t >= rg->to) { /* Remove entire region */ del += rg->to - rg->from; hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - rg->from, true); list_del(&rg->link); kfree(rg); continue; } if (f <= rg->from) { /* Trim beginning of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, t - rg->from, false); del += t - rg->from; rg->from = t; } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - f, false); del += rg->to - f; rg->to = f; } } spin_unlock(&resv->lock); kfree(nrg); return del; } /* * A rare out of memory error was encountered which prevented removal of * the reserve map region for a page. The huge page itself was free'ed * and removed from the page cache. This routine will adjust the subpool * usage count, and the global reserve count if needed. By incrementing * these counts, the reserve map entry which could not be deleted will * appear as a "reserved" entry instead of simply dangling with incorrect * counts. */ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); if (!hugetlb_acct_memory(h, 1)) reserved = true; } else if (!rsv_adjust) { reserved = true; } if (!reserved) pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* * Count and return the number of huge pages in the reserve map * that intersect with the range [f, t). */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; long seg_to; if (rg->to <= f) continue; if (rg->from >= t) break; seg_from = max(rg->from, f); seg_to = min(rg->to, t); chg += seg_to - seg_from; } spin_unlock(&resv->lock); return chg; } /* * Convert the address within this vma to the page offset within * the mapping, huge page units here. 
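 *
 * Example (assuming a 2MB hstate and 4K base pages): an address 4MB past
 * vm_start yields (4MB >> 21) + (vm_pgoff >> 9), i.e. two huge pages past
 * the mapping offset encoded in vm_pgoff.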
*/ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> huge_page_shift(h)) + (vma->vm_pgoff >> huge_page_order(h)); } /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. * * Folios in this VMA will be aligned to, and at least the size of, the * number of bytes returned by this function. * * Return: The default size of the folios allocated when backing a VMA. */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->pagesize) return vma->vm_ops->pagesize(vma); return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On * architectures where it differs, an architecture-specific 'strong' * version of this symbol is required. */ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. */ #define HPAGE_RESV_OWNER (1UL << 0) #define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have its future faults succeed. * * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result. * * The private mapping reservation is represented in a subtly different * manner to a shared mapping. A shared mapping has a region map associated * with the underlying file; this region map represents the backing file * pages which have ever had a reservation assigned, and this persists even * after the page is instantiated. A private mapping has a region map * associated with the original mmap which is attached to all VMAs which * reference it; this region map represents those offsets which have consumed * a reservation, i.e. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { return (unsigned long)vma->vm_private_data; } static void set_vma_private_data(struct vm_area_struct *vma, unsigned long value) { vma->vm_private_data = (void *)value; } static void resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, struct hugetlb_cgroup *h_cg, struct hstate *h) { #ifdef CONFIG_CGROUP_HUGETLB if (!h_cg || !h) { resv_map->reservation_counter = NULL; resv_map->pages_per_hpage = 0; resv_map->css = NULL; } else { resv_map->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)]; resv_map->pages_per_hpage = pages_per_huge_page(h); resv_map->css = &h_cg->css; } #endif } struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); if (!resv_map || !rg) { kfree(resv_map); kfree(rg); return NULL; } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 0; /* * Initialize these to 0.
On shared mappings, 0's here indicate these * fields don't do cgroup accounting. On private mappings, these will be * re-initialized to the proper values, to indicate that hugetlb cgroup * reservations are to be un-charged from here. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); INIT_LIST_HEAD(&resv_map->region_cache); list_add(&rg->link, &resv_map->region_cache); resv_map->region_cache_count = 1; return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); struct list_head *head = &resv_map->region_cache; struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ region_del(resv_map, 0, LONG_MAX); /* ... and any entries left in the cache */ list_for_each_entry_safe(rg, trg, head, link) { list_del(&rg->link); kfree(rg); } VM_BUG_ON(resv_map->adds_in_progress); kfree(resv_map); } static inline struct resv_map *inode_resv_map(struct inode *inode) { /* * At inode evict time, i_mapping may not point to the original * address space within the inode. This original address space * contains the pointer to the resv_map. So, always use the * address space embedded within the inode. * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); if (vma->vm_flags & VM_MAYSHARE) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; return inode_resv_map(inode); } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); return (get_vma_private_data(vma) & flag) != 0; } bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && get_vma_private_data(vma) & ~HPAGE_RESV_MASK && is_vma_resv_set(vma, HPAGE_RESV_OWNER); } void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data * - For shared mappings this is a per-vma semaphore that may be * allocated in a subsequent call to hugetlb_vm_op_open. * Before clearing, make sure pointer is not associated with vma * as this will leak the structure. This is the case when called * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. Faults generated by the children are * not guaranteed to succeed, even if read-only. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock && vma_lock->vma != vma) vma->vm_private_data = NULL; } else vma->vm_private_data = NULL; } /* * Reset and decrement one ref on hugepage private reservation. 
* Called with the mm->mmap_lock writer semaphore held. * This function should only be used by move_vma() and must operate on a * same-sized vma. It should never be called with the last ref on the * reservation. */ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) { /* * Clear the old hugetlb private page reservation. * It has already been transferred to new_vma. * * During a mremap() operation of a hugetlb vma we call move_vma() * which copies vma into new_vma and unmaps vma. After the copy * operation both new_vma and vma share a reference to the resv_map * struct, and at that point vma is about to be unmapped. We don't * want to return the reservation to the pool at unmap of vma because * the reservation still lives on in new_vma, so simply decrement the * ref here and remove the resv_map reference from this vma. */ struct resv_map *reservations = vma_resv_map(vma); if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_put_hugetlb_cgroup_uncharge_info(reservations); kref_put(&reservations->refs, resv_map_release); } hugetlb_dup_vma_private(vma); } static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; folio_set_hugetlb_freed(folio); } static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, int nid) { struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { if (pin && !folio_is_longterm_pinnable(folio)) continue; if (folio_test_hwpoison(folio)) continue; if (is_migrate_isolate_page(&folio->page)) continue; list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return folio; } return NULL; } static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; int node = NUMA_NO_NODE; /* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectify. */ if (nid == NUMA_NO_NODE) nid = numa_node_id(); zonelist = node_zonelist(nid, gfp_mask); retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; /* * No need to ask again on the same node. The pool is node rather * than zone aware. */ if (zone_to_nid(zone) == node) continue; node = zone_to_nid(zone); folio = dequeue_hugetlb_folio_node_exact(h, node); if (folio) return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return NULL; } static unsigned long available_huge_pages(struct hstate *h) { return h->free_huge_pages - h->resv_huge_pages; } static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; int nid; /* * gbl_chg==1 means the allocation requires a new page that was not * reserved before. Make sure there's at least one free page.
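 *
 * (available_huge_pages() is free_huge_pages - resv_huge_pages: free pages
 * already promised to existing reservations must not be handed out for an
 * unreserved allocation here.)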
*/ if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; err: return NULL; } /* * common helper functions for hstate_next_node_to_{alloc|free}. * We may have allocated or freed a huge page based on a different * nodes_allowed previously, so h->next_node_to_{alloc|free} might * be outside of *nodes_allowed. Ensure that we use an allowed * node for alloc or free. */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; } static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) { if (!node_isset(nid, *nodes_allowed)) nid = next_node_allowed(nid, nodes_allowed); return nid; } /* * returns the previously saved node ["this node"] from which to * allocate a persistent huge page for the pool and advance the * next node from which to allocate, handling wrap at end of node * mask. */ static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); nid = get_valid_node_allowed(*next_node, nodes_allowed); *next_node = next_node_allowed(nid, nodes_allowed); return nid; } /* * helper for remove_pool_hugetlb_folio() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. */ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); return nid; } #define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; int order = huge_page_order(h); bool retried = false; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); if (!folio) { if (hugetlb_cma_exclusive_alloc()) return NULL; folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); if (!folio) return NULL; } if (folio_ref_freeze(folio, 1)) return folio; pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); hugetlb_free_folio(folio); if (!retried) { retried = true; goto retry; } return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } #endif /* * Remove hugetlb folio from lists. 
* If vmemmap exists for the folio, clear the hugetlb flag so that the * folio appears as just a compound page. Otherwise, wait until after * allocating vmemmap to clear the flag. * * Must be called with hugetlb lock held. */ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } if (adjust_surplus) { h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; } /* * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) __folio_clear_hugetlb(folio); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; if (adjust_surplus) { h->surplus_huge_pages++; h->surplus_huge_pages_node[nid]++; } __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above * folio_change_private(folio, NULL) cleared it. */ folio_set_hugetlb_vmemmap_optimized(folio); arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; /* * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; /* * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the * page, put the page back on the hugetlb free list and treat * it as a surplus page. */ add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } /* * If vmemmap pages were allocated above, then we need to clear the * hugetlb flag under the hugetlb lock. */ if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } /* * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) folio_clear_hugetlb_hwpoison(folio); folio_ref_unfreeze(folio, 1); hugetlb_free_folio(folio); } /* * Since update_and_free_hugetlb_folio() can be called from any context, we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing to a workqueue to avoid using GFP_ATOMIC to allocate * the vmemmap pages.
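 *
 * (The free path may be reached from atomic context, where sleeping
 * GFP_KERNEL allocations are forbidden; handing the folio to a workqueue
 * moves the vmemmap allocation into process context instead.)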
* * free_hpage_workfn() locklessly retrieves the linked list of pages to be * freed and frees them one-by-one. As the page->mapping pointer is going * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node * structure of a lockless linked list of huge pages to be freed. */ static LLIST_HEAD(hpage_freelist); static void free_hpage_workfn(struct work_struct *work) { struct llist_node *node; node = llist_del_all(&hpage_freelist); while (node) { struct folio *folio; struct hstate *h; folio = container_of((struct address_space **)node, struct folio, mapping); node = node->next; folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ h = size_to_hstate(folio_size(folio)); __update_and_free_hugetlb_folio(h, folio); cond_resched(); } } static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { __update_and_free_hugetlb_folio(h, folio); return; } /* * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. * * Only call schedule_work() if hpage_freelist was previously * empty. Otherwise, schedule_work() has already been called but the * workfn hasn't retrieved the list yet. */ if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void bulk_vmemmap_restore_error(struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; if (!list_empty(non_hvo_folios)) { /* * Free any restored hugetlb pages so that restore of the * entire list can be retried. * The idea is that, in the common case of ENOMEM errors, freeing * hugetlb pages with vmemmap will free up memory so that we * can allocate vmemmap for more hugetlb pages. */ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } else { /* * In the case where there are no folios which can be * immediately freed, we loop through the list trying to restore * vmemmap individually in the hope that someone elsewhere may * have done something to cause success (such as freeing some * memory). If unable to restore a hugetlb page, the hugetlb * page is made a surplus page and removed from the list. * If we are able to restore vmemmap and free one hugetlb page, we * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); break; } } } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *folio_list) { long ret; struct folio *folio, *t_folio; LIST_HEAD(non_hvo_folios); /* * First allocate required vmemmap (if necessary) for all folios.
* Carefully handle errors and free up any available hugetlb pages * in an effort to make forward progress. */ retry: ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); if (ret < 0) { bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); goto retry; } /* * At this point, list should be empty, ret should be >= 0 and there * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call * __folio_clear_hugetlb as this was done previously. */ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; for_each_hstate(h) { if (huge_page_size(h) == size) return h; } return NULL; } void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); hugetlb_set_folio_subpool(folio, NULL); if (folio_test_anon(folio)) __ClearPageAnonExclusive(&folio->page); folio->mapping = NULL; restore_reserve = folio_test_hugetlb_restore_reserve(folio); folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { /* * A return code of zero implies that the subpool will be * under its minimum size if the reservation is not restored * after page is free. Therefore, force restore_reserve * operation. 
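 *
 * (hugepage_subpool_put_pages() returns the number of global reservations
 * to drop; a return of 0 means the freed page was absorbed into the
 * subpool's minimum reserve, so treat it like a restored reservation.)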
*/ if (hugepage_subpool_put_pages(spool, 1) == 0) restore_reserve = true; } spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); hugetlb_cgroup_uncharge_folio(hstate_index(h), pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h)); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } /* * Must be called with the hugetlb lock held */ static void __prep_account_new_huge_page(struct hstate *h, int nid) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; } static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } /* * Find and lock address space (mapping) in write mode. * * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; if (i_mmap_trylock_write(mapping)) return mapping; return NULL; } static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; bool retry = true; /* * By default we always try hard to allocate the folio with * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: folio = __folio_alloc(gfp_mask, order, nid, nmask); /* Ensure hugetlb folio won't have large_rmappable flag set. */ if (folio) folio_clear_large_rmappable(folio); if (folio && !folio_ref_freeze(folio, 1)) { folio_put(folio); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ pr_warn("HugeTLB unexpected inflated folio ref count\n"); folio = NULL; } /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a * folio this indicates an overall state change. 
Clear bit so * that we resume normal 'try hard' allocations. */ if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); return folio; } static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { struct folio *folio; if (hstate_is_gigantic(h)) folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; } /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages * * Note that returned page is 'frozen': ref count of head page and all tail * pages is zero. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; if (hstate_is_gigantic(h)) folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } static void prep_and_add_allocated_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Allocates a fresh hugetlb page in a node interleaved manner. The page * will later be added to the appropriate hugetlb pool. */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry, int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) return folio; } return NULL; } /* * Remove huge page from pool from next node to free. Attempt to keep * persistent huge pages more or less balanced over allowed nodes. * This routine only 'removes' the hugetlb page. The caller must make * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. 
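		 *
		 * (Illustrative: with acct_surplus set and
		 * surplus_huge_pages_node = { 2, 0 }, node 1 is skipped
		 * even when its free list is non-empty, so persistent
		 * pages are not freed by accident.)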
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
			folio = list_entry(h->hugepage_freelists[node].next,
					   struct folio, lru);
			remove_hugetlb_folio(h, folio, acct_surplus);
			break;
		}
	}

	return folio;
}

/*
 * Dissolve a given free hugetlb folio into free buddy pages. This function
 * does nothing for in-use hugetlb folios and non-hugetlb folios.
 * This function returns values like below:
 *
 *  -ENOMEM: failed to allocate vmemmap pages needed to free the hugepage
 *           when the system is under memory pressure and the feature of
 *           freeing unused vmemmap pages associated with each hugetlb page
 *           is enabled.
 *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
 *           (allocated or reserved).
 *  0:       successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 */
int dissolve_free_hugetlb_folio(struct folio *folio)
{
	int rc = -EBUSY;

retry:
	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!folio_test_hugetlb(folio))
		return 0;

	spin_lock_irq(&hugetlb_lock);
	if (!folio_test_hugetlb(folio)) {
		rc = 0;
		goto out;
	}

	if (!folio_ref_count(folio)) {
		struct hstate *h = folio_hstate(folio);
		bool adjust_surplus = false;

		if (!available_huge_pages(h))
			goto out;

		/*
		 * We should make sure that the page is already on the free list
		 * when it is dissolved.
		 */
		if (unlikely(!folio_test_hugetlb_freed(folio))) {
			spin_unlock_irq(&hugetlb_lock);
			cond_resched();

			/*
			 * Theoretically, we should return -EBUSY when we
			 * encounter this race. In fact, we have a chance
			 * to successfully dissolve the page if we do a
			 * retry, because the race window is quite small.
			 * If we seize this opportunity, it is an optimization
			 * for increasing the success rate of dissolving the
			 * page.
			 */
			goto retry;
		}

		if (h->surplus_huge_pages_node[folio_nid(folio)])
			adjust_surplus = true;
		remove_hugetlb_folio(h, folio, adjust_surplus);
		h->max_huge_pages--;
		spin_unlock_irq(&hugetlb_lock);

		/*
		 * Normally update_and_free_hugetlb_folio will allocate required
		 * vmemmap before freeing the page. update_and_free_hugetlb_folio
		 * will fail to free the page if it can not allocate required
		 * vmemmap. We need to adjust max_huge_pages if the page is not
		 * freed. Attempt to allocate vmemmap here so that we can take
		 * appropriate action on failure.
		 *
		 * The folio_test_hugetlb check here is because
		 * remove_hugetlb_folio will clear hugetlb folio flag for
		 * non-vmemmap optimized hugetlb folios.
		 */
		if (folio_test_hugetlb(folio)) {
			rc = hugetlb_vmemmap_restore_folio(h, folio);
			if (rc) {
				spin_lock_irq(&hugetlb_lock);
				add_hugetlb_folio(h, folio, adjust_surplus);
				h->max_huge_pages++;
				goto out;
			}
		} else
			rc = 0;

		update_and_free_hugetlb_folio(h, folio, false);
		return rc;
	}
out:
	spin_unlock_irq(&hugetlb_lock);
	return rc;
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
 * free hugetlb folios that were dissolved before that error are lost.
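 *
 * A minimal caller sketch (illustrative only; 'start_pfn' and 'end_pfn'
 * are assumed to come from a memory offlining path):
 *
 *	int rc = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
 *
 *	if (rc)
 *		pr_warn("HugeTLB: range not dissolvable (%d)\n", rc);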
 */
int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct folio *folio;
	int rc = 0;
	unsigned int order;
	struct hstate *h;

	if (!hugepages_supported())
		return rc;

	order = huge_page_order(&default_hstate);
	for_each_hstate(h)
		order = min(order, huge_page_order(h));

	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
		folio = pfn_folio(pfn);
		rc = dissolve_free_hugetlb_folio(folio);
		if (rc)
			break;
	}

	return rc;
}

/*
 * Allocates a fresh surplus page from the page allocator.
 */
static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
		gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
	struct folio *folio = NULL;

	if (hstate_is_gigantic(h))
		return NULL;

	spin_lock_irq(&hugetlb_lock);
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
		goto out_unlock;
	spin_unlock_irq(&hugetlb_lock);

	folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
	if (!folio)
		return NULL;

	hugetlb_vmemmap_optimize_folio(h, folio);

	spin_lock_irq(&hugetlb_lock);
	/*
	 * nr_huge_pages needs to be adjusted within the same lock cycle
	 * as surplus_pages, otherwise it might confuse
	 * persistent_huge_pages() momentarily.
	 */
	__prep_account_new_huge_page(h, folio_nid(folio));

	/*
	 * We could have raced with the pool size change.
	 * Double check that and simply deallocate the new page
	 * if we would end up overcommitting the surpluses. Abuse
	 * the temporary page to work around the nasty free_huge_folio
	 * code flow.
	 */
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		folio_set_hugetlb_temporary(folio);
		spin_unlock_irq(&hugetlb_lock);
		free_huge_folio(folio);
		return NULL;
	}

	h->surplus_huge_pages++;
	h->surplus_huge_pages_node[folio_nid(folio)]++;

out_unlock:
	spin_unlock_irq(&hugetlb_lock);

	return folio;
}

static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
				     int nid, nodemask_t *nmask)
{
	struct folio *folio;

	if (hstate_is_gigantic(h))
		return NULL;

	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
	if (!folio)
		return NULL;

	/* fresh huge pages are frozen */
	folio_ref_unfreeze(folio, 1);
	/*
	 * We do not account these pages as surplus because they are only
	 * temporary and will be released properly on the last reference.
	 */
	folio_set_hugetlb_temporary(folio);

	return folio;
}

/*
 * Use the VMA's mpolicy to allocate a huge page from the buddy.
*/ static struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!folio) folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return folio; } struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { struct folio *folio; spin_lock_irq(&hugetlb_lock); folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { VM_BUG_ON(!h->resv_huge_pages); h->resv_huge_pages--; } spin_unlock_irq(&hugetlb_lock); return folio; } /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { struct folio *folio; folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); if (folio) { spin_unlock_irq(&hugetlb_lock); return folio; } } spin_unlock_irq(&hugetlb_lock); /* We cannot fallback to other nodes, as we could break the per-node pool. */ if (!allow_alloc_fallback) gfp_mask |= __GFP_THISNODE; return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } static nodemask_t *policy_mbind_nodemask(gfp_t gfp) { #ifdef CONFIG_NUMA struct mempolicy *mpol = get_task_policy(current); /* * Only enforce MPOL_BIND policy which overlaps with cpuset policy * (from policy_nodemask) specifically for hugetlb case */ if (mpol->mode == MPOL_BIND && (apply_policy_zone(mpol, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) return &mpol->nodes; #endif return NULL; } /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); struct folio *folio, *tmp; int ret; long i; long needed, allocated; bool alloc_ok = true; int node; nodemask_t *mbind_nodemask, alloc_nodemask; mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); if (mbind_nodemask) nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); else alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; return 0; } allocated = 0; ret = -ENOMEM; retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { folio = NULL; /* Prioritize current node */ if (node_isset(numa_mem_id(), alloc_nodemask)) folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), numa_mem_id(), NULL); if (!folio) { for_each_node_mask(node, alloc_nodemask) { if (node == numa_mem_id()) continue; folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), node, NULL); if (folio) break; } } if (!folio) { alloc_ok = false; break; } list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; /* * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. 
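	 *
	 * Numeric sketch (illustrative): with delta = 4, resv_huge_pages = 10
	 * and free_huge_pages = 8, needed starts at 6. If another thread
	 * freed 2 huge pages while the lock was dropped, the recalculation
	 * below gives (10 + 4) - (10 + 6) = -2, so four of the six new
	 * surplus folios are enqueued and the two extras are freed back.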
*/ spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { if (alloc_ok) goto retry; /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ goto free; } /* * The surplus_list now contains _at_least_ the number of extra pages * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but * before they are reserved. */ needed += allocated; h->resv_huge_pages += delta; ret = 0; /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. * Pages have no ref count, call free_huge_folio directly. */ list_for_each_entry_safe(folio, tmp, &surplus_list, lru) free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; } /* * This routine has two main purposes: * 1) Decrement the reservation count (resv_huge_pages) by the value passed * in unused_resv_pages. This corresponds to the prior adjustments made * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) goto out; /* * Part (or even all) of the reservation could have been backed * by pre-allocated pages. Only free surplus pages. */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* * We want to release as many surplus pages as possible, spread * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { struct folio *folio; folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); if (!folio) goto out; list_add(&folio->lru, &page_list); } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * vma_needs_reservation, vma_commit_reservation and vma_end_reservation * are used by the huge page allocation routines to manage reservations. * * vma_needs_reservation is called to determine if the huge page at addr * within the vma has an associated reservation. If a reservation is * needed, the value 1 is returned. The caller is then responsible for * managing the global reservation and subpool usage counts. After * the huge page has been allocated, vma_commit_reservation is called * to add the page to the reservation map. If the page allocation fails, * the reservation must be ended instead of committed. vma_end_reservation * is called in such cases. * * In the normal case, vma_commit_reservation returns the same value * as the preceding vma_needs_reservation call. 
The only time this * is not the case is if a reserve map was changed between calls. It * is the responsibility of the caller to notice the difference and * take appropriate action. * * vma_add_reservation is used in error paths where a reservation must * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. * * vma_del_reservation is used in error paths where an entry in the reserve * map was created during huge page allocation and must be removed. It is to * be called after calling vma_needs_reservation to determine if a reservation * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; long ret; long dummy_out_regions_needed; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); switch (mode) { case VMA_NEEDS_RESV: ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); /* We assume that vma_reservation_* routines always operate on * 1 page, and that adding to resv map a 1 page entry can only * ever require 1 region. */ VM_BUG_ON(dummy_out_regions_needed != 1); break; case VMA_COMMIT_RESV: ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); break; case VMA_END_RESV: region_abort(resv, idx, idx + 1, 1); ret = 0; break; case VMA_ADD_RESV: if (vma->vm_flags & VM_MAYSHARE) { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } else { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } break; case VMA_DEL_RESV: if (vma->vm_flags & VM_MAYSHARE) { region_abort(resv, idx, idx + 1, 1); ret = region_del(resv, idx, idx + 1); } else { ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); /* region_add calls of range 1 should never fail. */ VM_BUG_ON(ret < 0); } break; default: BUG(); } if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. * * In most cases, reserves always exist for private mappings. * However, a file associated with mapping could have been * hole punched or truncated after reserves were consumed. * As subsequent fault on such a range will not use reserves. * Subtle - The reserve map for private mappings has the * opposite meaning than that of shared mappings. If NO * entry is in the reserve map, it means a reservation exists. * If an entry exists in the reserve map, it means the * reservation has already been consumed. As a result, the * return value of this routine is the opposite of the * value returned from reserve map manipulation routines above. 
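	 *
	 * Concretely (an illustrative reading of the inversion below):
	 * for a private mapping, region_chg() returning 1 means no entry
	 * was found, so a reservation still exists and 0 is returned;
	 * region_chg() returning 0 means an entry exists, i.e. the
	 * reservation was already consumed, and 1 is returned.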
*/ if (ret > 0) return 0; if (ret == 0) return 1; return ret; } static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); } static long vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); } static void vma_end_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } static long vma_add_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } static long vma_del_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); } /* * This routine is called to restore reservation information on error paths. * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: * 1) A reservation was in place and the folio consumed the reservation. * hugetlb_restore_reserve is set in the folio. * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_folio later in the error path will increment the * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct folio *folio) { long rc = vma_needs_reservation(h, vma, address); if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else vma_end_reservation(h, vma, address); } else { if (!rc) { /* * This indicates there is an entry in the reserve map * not added by alloc_hugetlb_folio. We know it was added * before the alloc_hugetlb_folio call, otherwise * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ rc = vma_del_reservation(h, vma, address); if (rc < 0) /* * VERY rare out of memory condition. Since * we can not delete the entry, set * hugetlb_restore_reserve so that the reserve * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from * vma_needs_reservation call. Memory allocation is * only attempted if a new entry is needed. 
Therefore, * this implies there is not an entry in the * reserve map. * * For shared mappings, no entry in the map indicates * no reservation. We are done. */ if (!(vma->vm_flags & VM_MAYSHARE)) /* * For private mappings, no entry indicates * a reservation is present. Since we can * not add an entry, set hugetlb_restore_reserve * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing */ vma_end_reservation(h, vma, address); } } /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_folio too. */ goto free_new; } else if (folio_ref_count(old_folio)) { bool isolated; /* * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ spin_unlock_irq(&hugetlb_lock); cond_resched(); goto retry; } else { if (!new_folio) { spin_unlock_irq(&hugetlb_lock); new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) return -ENOMEM; __prep_new_hugetlb_folio(h, new_folio); goto retry; } /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ remove_hugetlb_folio(h, old_folio, false); /* * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); enqueue_hugetlb_folio(h, new_folio); /* * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); if (new_folio) update_and_free_hugetlb_folio(h, new_folio, false); return ret; } int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; struct folio *folio = page_folio(page); int ret = -EBUSY; /* * The page might have been dissolved from under our feet, so make sure * to carefully check the state under the lock. * Return success when racing as if we dissolved the page ourselves. */ spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; } spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. 
	 * Return -ENOMEM as this has the effect of bailing out right away
	 * without further retrying.
	 */
	if (hstate_is_gigantic(h))
		return -ENOMEM;

	if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
		ret = 0;
	else if (!folio_ref_count(folio))
		ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);

	return ret;
}

/*
 * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
 * range with new folios.
 * @start_pfn: start pfn of the given pfn range
 * @end_pfn: end pfn of the given pfn range
 * Returns 0 on success, otherwise negated error.
 */
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
{
	struct hstate *h;
	struct folio *folio;
	int ret = 0;

	LIST_HEAD(isolate_list);

	while (start_pfn < end_pfn) {
		folio = pfn_folio(start_pfn);

		if (folio_test_hugetlb(folio)) {
			h = folio_hstate(folio);
		} else {
			start_pfn++;
			continue;
		}

		if (!folio_ref_count(folio)) {
			ret = alloc_and_dissolve_hugetlb_folio(h, folio,
							       &isolate_list);
			if (ret)
				break;

			putback_movable_pages(&isolate_list);
		}
		start_pfn++;
	}

	return ret;
}

void wait_for_freed_hugetlb_folios(void)
{
	if (llist_empty(&hpage_freelist))
		return;

	flush_work(&free_hpage_work);
}

typedef enum {
	/*
	 * For either 0/1: we checked the per-vma resv map, and one resv
	 * count either can be reused (0), or an extra needed (1).
	 */
	MAP_CHG_REUSE = 0,
	MAP_CHG_NEEDED = 1,
	/*
	 * The per-vma resv count cannot be used, hence a new resv
	 * count is enforced.
	 *
	 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
	 * that currently vma_needs_reservation() has an unwanted side
	 * effect to either use end() or commit() to complete the
	 * transaction. Hence it needs to differentiate from NEEDED.
	 */
	MAP_CHG_ENFORCED = 2,
} map_chg_state;

/*
 * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
 * faults of hugetlb private mappings on top of a non-page-cache folio (in
 * which case even if there's a private vma resv map it won't cover such
 * allocation). New call sites should (probably) never set it to true!!
 * When it's set, the allocation will bypass all vma level reservations.
 */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
				  unsigned long addr, bool cow_from_owner)
{
	struct hugepage_subpool *spool = subpool_vma(vma);
	struct hstate *h = hstate_vma(vma);
	struct folio *folio;
	long retval, gbl_chg;
	map_chg_state map_chg;
	int ret, idx;
	struct hugetlb_cgroup *h_cg = NULL;
	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;

	idx = hstate_index(h);

	/* Do we need a separate per-vma reservation? */
	if (cow_from_owner) {
		/*
		 * Special case! Since it's a CoW on top of a reserved
		 * page, the private resv map doesn't count. So it cannot
		 * consume the per-vma resv map even if it's reserved.
		 */
		map_chg = MAP_CHG_ENFORCED;
	} else {
		/*
		 * Examine the region/reserve map to determine if the process
		 * has a reservation for the page to be allocated. A return
		 * code of zero indicates a reservation exists (no change).
		 */
		retval = vma_needs_reservation(h, vma, addr);
		if (retval < 0)
			return ERR_PTR(-ENOMEM);
		map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
	}

	/*
	 * Do we need a separate global reservation?
	 *
	 * Processes that did not create the mapping will have no
	 * reserves as indicated by the region/reserve map. Check
	 * that the allocation will not exceed the subpool limit.
	 * Or if it can get one from the pool reservation directly.
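	 *
	 * An illustrative summary of the branches below:
	 *	MAP_CHG_REUSE    -> gbl_chg = 0, the vma reservation covers it
	 *	MAP_CHG_NEEDED   -> gbl_chg = hugepage_subpool_get_pages(spool, 1)
	 *	MAP_CHG_ENFORCED -> likewise charged against the subpool,
	 *			    bypassing any vma reservation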
	 */
	if (map_chg) {
		gbl_chg = hugepage_subpool_get_pages(spool, 1);
		if (gbl_chg < 0)
			goto out_end_reservation;
	} else {
		/*
		 * If we have the vma reservation ready, no need for extra
		 * global reservation.
		 */
		gbl_chg = 0;
	}

	/*
	 * If this allocation is not consuming a per-vma reservation,
	 * charge the hugetlb cgroup now.
	 */
	if (map_chg) {
		ret = hugetlb_cgroup_charge_cgroup_rsvd(
			idx, pages_per_huge_page(h), &h_cg);
		if (ret)
			goto out_subpool_put;
	}

	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
	if (ret)
		goto out_uncharge_cgroup_reservation;

	spin_lock_irq(&hugetlb_lock);
	/*
	 * gbl_chg is passed to indicate whether or not a page must be taken
	 * from the global free pool (global change). gbl_chg == 0 indicates
	 * a reservation exists for the allocation.
	 */
	folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
	if (!folio) {
		spin_unlock_irq(&hugetlb_lock);
		folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
		if (!folio)
			goto out_uncharge_cgroup;
		spin_lock_irq(&hugetlb_lock);
		list_add(&folio->lru, &h->hugepage_activelist);
		folio_ref_unfreeze(folio, 1);
		/* Fall through */
	}

	/*
	 * Either dequeued or buddy-allocated folio needs to add special
	 * mark to the folio when it consumes a global reservation.
	 */
	if (!gbl_chg) {
		folio_set_hugetlb_restore_reserve(folio);
		h->resv_huge_pages--;
	}

	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
	/* If allocation is not consuming a reservation, also store the
	 * hugetlb_cgroup pointer on the page.
	 */
	if (map_chg) {
		hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
						  h_cg, folio);
	}

	spin_unlock_irq(&hugetlb_lock);

	hugetlb_set_folio_subpool(folio, spool);

	if (map_chg != MAP_CHG_ENFORCED) {
		/* commit() is only needed if the map_chg is not enforced */
		retval = vma_commit_reservation(h, vma, addr);
		/*
		 * Check for possible race conditions. When it happens, the
		 * page was added to the reservation map between
		 * vma_needs_reservation and vma_commit_reservation.
		 * This indicates a race with hugetlb_reserve_pages.
		 * Adjust for the subpool count incremented above AND
		 * in hugetlb_reserve_pages for the same page. Also,
		 * the reservation count added in hugetlb_reserve_pages
		 * no longer applies.
		 */
		if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
			long rsv_adjust;

			rsv_adjust = hugepage_subpool_put_pages(spool, 1);
			hugetlb_acct_memory(h, -rsv_adjust);
			if (map_chg) {
				spin_lock_irq(&hugetlb_lock);
				hugetlb_cgroup_uncharge_folio_rsvd(
					hstate_index(h), pages_per_huge_page(h),
					folio);
				spin_unlock_irq(&hugetlb_lock);
			}
		}
	}

	ret = mem_cgroup_charge_hugetlb(folio, gfp);
	/*
	 * Unconditionally increment NR_HUGETLB here. If it turns out that
	 * mem_cgroup_charge_hugetlb failed, then immediately free the page and
	 * decrement NR_HUGETLB.
*/ lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); if (ret == -ENOMEM) { free_huge_folio(folio); return ERR_PTR(-ENOMEM); } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: if (map_chg) hugepage_subpool_put_pages(spool, 1); out_end_reservation: if (map_chg != MAP_CHG_ENFORCED) vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact) { struct huge_bootmem_page *m; int listnode = nid; if (hugetlb_early_cma(h)) m = hugetlb_cma_alloc_bootmem(h, &listnode, node_exact); else { if (node_exact) m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); else { m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); /* * For pre-HVO to work correctly, pages need to be on * the list for the node they were actually allocated * from. That node may be different in the case of * fallback by memblock_alloc_try_nid_raw. So, * extract the actual node first. */ if (m) listnode = early_pfn_to_nid(PHYS_PFN(virt_to_phys(m))); } if (m) { m->flags = 0; m->cma = NULL; } } if (m) { /* * Use the beginning of the huge page to store the * huge_bootmem_page struct (until gather_bootmem * puts them into the mem_map). * * Put them into a private list first because mem_map * is not up yet. */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages[listnode]); m->hstate = h; } return m; } int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { m = alloc_bootmem(h, node, true); if (!m) return 0; goto found; } /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { m = alloc_bootmem(h, node, false); if (!m) return 0; goto found; } found: /* * Only initialize the head struct page in memmap_init_reserved_pages, * rest of the struct pages will be initialized by the HugeTLB * subsystem itself. * The head struct page is used to get folio information by the HugeTLB * subsystem like zone id and node id. 
*/ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), huge_page_size(h) - PAGE_SIZE); return 1; } /* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, unsigned long start_page_number, unsigned long end_page_number) { enum zone_type zone = zone_idx(folio_zone(folio)); int nid = folio_nid(folio); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; int ret; for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); VM_BUG_ON(!ret); } } static void __init hugetlb_folio_init_vmemmap(struct folio *folio, struct hstate *h, unsigned long nr_pages) { int ret; /* Prepare folio head */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head((struct page *)folio, huge_page_order(h)); } static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_HVO; } static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m) { return m->flags & HUGE_BOOTMEM_CMA; } /* * memblock-allocated pageblocks might not have the migrate type set * if marked with the 'noinit' flag. Set it to the default (MIGRATE_MOVABLE) * here, or MIGRATE_CMA if this was a page allocated through an early CMA * reservation. * * In case of vmemmap optimized folios, the tail vmemmap pages are mapped * read-only, but that's ok - for sparse vmemmap this does not write to * the page structure. */ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, struct hstate *h) { unsigned long nr_pages = pages_per_huge_page(h), i; WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio))); for (i = 0; i < nr_pages; i += pageblock_nr_pages) { if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else set_pageblock_migratetype(folio_page(folio, i), MIGRATE_MOVABLE); } } static void __init prep_and_add_bootmem_folios(struct hstate *h, struct list_head *folio_list) { unsigned long flags; struct folio *folio, *tmp_f; /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* * If HVO fails, initialize all tail struct pages * We do not worry about potential long lock hold * time as this is early in boot and there should * be no contention. */ hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } bool __init hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m) { unsigned long start_pfn; bool valid; if (m->flags & HUGE_BOOTMEM_ZONES_VALID) { /* * Already validated, skip check. 
*/ return true; } if (hugetlb_bootmem_page_earlycma(m)) { valid = cma_validate_zones(m->cma); goto out; } start_pfn = virt_to_phys(m) >> PAGE_SHIFT; valid = !pfn_range_intersects_zones(nid, start_pfn, pages_per_huge_page(m->hstate)); out: if (!valid) hstate_boot_nrinvalid[hstate_index(m->hstate)]++; return valid; } /* * Free a bootmem page that was found to be invalid (intersecting with * multiple zones). * * Since it intersects with multiple zones, we can't just do a free * operation on all pages at once, but instead have to walk all * pages, freeing them one by one. */ static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page, struct hstate *h) { unsigned long npages = pages_per_huge_page(h); unsigned long pfn; while (npages--) { pfn = page_to_pfn(page); __init_page_from_nid(pfn, nid); free_reserved_page(page); page++; } } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m, *tm; struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; h = m->hstate; if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Can't use this page. Initialize the * page structures if that hasn't already * been done, and give them to the page * allocator. */ hugetlb_bootmem_free_invalid_page(nid, page, h); continue; } /* * It is possible to have multiple huge page sizes (hstates) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); init_new_hugetlb_folio(h, folio); if (hugetlb_bootmem_page_prehvo(m)) /* * If pre-HVO was done, just set the * flag, the HVO code will then skip * this folio. */ folio_set_hugetlb_vmemmap_optimized(folio); if (hugetlb_bootmem_page_earlycma(m)) folio_set_hugetlb_cma(folio); list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages * in order to fix confusing memory reports from free(1) and * other side-effects, like CommitLimit going negative. * * For CMA pages, this is done in init_cma_pageblock * (via hugetlb_bootmem_init_migratetype), so skip it here. 
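		 *
		 * (Illustrative: for a 2M hugepage, pages_per_huge_page()
		 * is 512, so 512 pages are added back to the zone's managed
		 * page count below.)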
*/ if (!folio_test_hugetlb_cma(folio)) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } prep_and_add_bootmem_folios(h, &folio_list); } static void __init gather_bootmem_prealloc_parallel(unsigned long start, unsigned long end, void *arg) { int nid; for (nid = start; nid < end; nid++) gather_bootmem_prealloc_node(nid); } static void __init gather_bootmem_prealloc(void) { struct padata_mt_job job = { .thread_fn = gather_bootmem_prealloc_parallel, .fn_arg = NULL, .start = 0, .size = nr_node_ids, .align = 1, .min_chunk = 1, .max_threads = num_node_state(N_MEMORY), .numa_aware = true, }; padata_do_multithreaded(&job); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; LIST_HEAD(folio_list); for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h, nid)) break; } else { struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); if (!folio) break; list_add(&folio->lru, &folio_list); } cond_resched(); } if (!list_empty(&folio_list)) prep_and_add_allocated_folios(h, &folio_list); if (i == h->max_huge_pages_node[nid]) return; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); h->max_huge_pages_node[nid] = i; } static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) { int i; bool node_specific_alloc = false; for_each_online_node(i) { if (h->max_huge_pages_node[i] > 0) { hugetlb_hstate_alloc_pages_onenode(h, i); node_specific_alloc = true; } } return node_specific_alloc; } static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) { if (allocated < h->max_huge_pages) { char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", h->max_huge_pages, buf, allocated); h->max_huge_pages = allocated; } } static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) { struct hstate *h = (struct hstate *)arg; int i, num = end - start; nodemask_t node_alloc_noretry; LIST_HEAD(folio_list); int next_node = first_online_node; /* Bit mask controlling how hard we retry per-node allocations.*/ nodes_clear(node_alloc_noretry); for (i = 0; i < num; ++i) { struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], &node_alloc_noretry, &next_node); if (!folio) break; list_move(&folio->lru, &folio_list); cond_resched(); } prep_and_add_allocated_folios(h, &folio_list); } static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) { unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; cond_resched(); } return i; } static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) { struct padata_mt_job job = { .fn_arg = h, .align = 1, .numa_aware = true }; unsigned long jiffies_start; unsigned long jiffies_end; job.thread_fn = hugetlb_pages_alloc_boot_node; job.start = 0; job.size = h->max_huge_pages; /* * job.max_threads is 25% of the available cpu threads by default. 
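	 *
	 * (e.g. on a machine with 64 online CPUs and no explicit
	 * hugepage_allocation_threads= setting, 64 / 4 = 16 threads
	 * are used.)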
	 *
	 * On large servers with terabytes of memory, huge page allocation
	 * can consume a considerable amount of time.
	 *
	 * Tests below show how long it takes to allocate 1 TiB of memory
	 * with 2MiB huge pages. Using more threads can significantly
	 * improve allocation time.
	 *
	 * +-----------------------+-------+-------+-------+-------+-------+
	 * | threads               |   8   |   16  |   32  |   64  |  128  |
	 * +-----------------------+-------+-------+-------+-------+-------+
	 * | skylake      144 cpus |   44s |   22s |   16s |   19s |   20s |
	 * | cascade lake 192 cpus |   39s |   20s |   11s |   10s |    9s |
	 * +-----------------------+-------+-------+-------+-------+-------+
	 */
	if (hugepage_allocation_threads == 0) {
		hugepage_allocation_threads = num_online_cpus() / 4;
		hugepage_allocation_threads =
			max(hugepage_allocation_threads, 1);
	}

	job.max_threads	= hugepage_allocation_threads;
	job.min_chunk	= h->max_huge_pages / hugepage_allocation_threads;

	jiffies_start = jiffies;
	padata_do_multithreaded(&job);
	jiffies_end = jiffies;

	pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
		jiffies_to_msecs(jiffies_end - jiffies_start),
		hugepage_allocation_threads);

	return h->nr_huge_pages;
}

/*
 * NOTE: this routine is called in different contexts for gigantic and
 * non-gigantic pages.
 * - For gigantic pages, this is called early in the boot process and
 *   pages are allocated from the memblock allocator or something similar.
 *   Gigantic pages are actually added to pools later with the routine
 *   gather_bootmem_prealloc.
 * - For non-gigantic pages, this is called later in the boot process after
 *   all of mm is up and functional. Pages are allocated from buddy and
 *   then added to hugetlb pools.
 */
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
	unsigned long allocated;

	/*
	 * Skip gigantic hugepages allocation if early CMA
	 * reservations are not available.
	 */
	if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
	    !hugetlb_early_cma(h)) {
		pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
		return;
	}

	/* do node specific alloc */
	if (hugetlb_hstate_alloc_pages_specific_nodes(h))
		return;

	/* below will do all node balanced alloc */
	if (hstate_is_gigantic(h))
		allocated = hugetlb_gigantic_pages_alloc_boot(h);
	else
		allocated = hugetlb_pages_alloc_boot(h);

	hugetlb_hstate_alloc_pages_errcheck(allocated, h);
}

static void __init hugetlb_init_hstates(void)
{
	struct hstate *h, *h2;

	for_each_hstate(h) {
		/* oversize hugepages were init'ed in early boot */
		if (!hstate_is_gigantic(h))
			hugetlb_hstate_alloc_pages(h);

		/*
		 * Set demote order for each hstate. Note that
		 * h->demote_order is initially 0.
		 * - We can not demote gigantic pages if runtime freeing
		 *   is not supported, so skip this.
		 * - If CMA allocation is possible, we can not demote
		 *   HUGETLB_PAGE_ORDER or smaller size pages.
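		 *
		 * (Illustrative: with 1G and 2M hstates and no CMA, the
		 * loop below leaves the 2M hstate's demote_order at 0 and
		 * sets the 1G hstate's demote_order to the 2M order,
		 * HUGETLB_PAGE_ORDER.)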
*/ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) continue; if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER) continue; for_each_hstate(h2) { if (h2 == h) continue; if (h2->order < h->order && h2->order > h->demote_order) h->demote_order = h2->order; } } } static void __init report_hugepages(void) { struct hstate *h; unsigned long nrinvalid; for_each_hstate(h) { char buf[32]; nrinvalid = hstate_boot_nrinvalid[hstate_index(h)]; h->max_huge_pages -= nrinvalid; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", buf, nrinvalid, nrinvalid > 1 ? "s" : ""); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; /* * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; if (folio_test_highmem(folio)) continue; remove_hugetlb_folio(h, folio, false); list_add(&folio->lru, &page_list); } } out: spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { } #endif /* * Increment or decrement surplus_huge_pages. Keep node-specific counters * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { int nr_nodes, node; lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } } else { for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node] < h->nr_huge_pages_node[node]) goto found; } } return 0; found: h->surplus_huge_pages += delta; h->surplus_huge_pages_node[node] += delta; return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long persistent_free_count; unsigned long min_count; unsigned long allocated; struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* * Bit mask controlling how hard we retry per-node allocations. * If we can not allocate the bit mask, do not attempt to allocate * the requested huge pages. */ if (node_alloc_noretry) nodes_clear(*node_alloc_noretry); else return -ENOMEM; /* * resize_lock mutex prevents concurrent adjustments to number of * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. * Changing node specific huge page count may require a corresponding * change to the global count. 
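	 *
	 * Worked example (illustrative): with two nodes holding 4
	 * persistent pages each (8 globally) and a request of 6 pages for
	 * node 0, the adjustment below yields count = 6 + 8 - 4 = 10
	 * globally, and nodes_allowed confines the 2 extra pages to node 0.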
In any case, the passed node mask * (nodes_allowed) will restrict alloc/free to the specified node. */ if (nid != NUMA_NO_NODE) { unsigned long old_count = count; count += persistent_huge_pages(h) - (h->nr_huge_pages_node[nid] - h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted * to allocate as many huge pages as possible. Set count to * largest possible value to align with their intention. */ if (count < old_count) count = ULONG_MAX; } /* * Gigantic pages runtime allocation depend on the capability for large * page range allocation. * If the system does not provide this feature, return an error when * the user tries to allocate gigantic pages but let the user free the * boottime allocated gigantic pages. */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } /* Fall through to decrease pool */ } /* * Increase the pool size * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } allocated = 0; while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. */ spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry, &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } list_add(&folio->lru, &page_list); allocated++; /* Bail for signals. Probably ctrl-c from user */ if (signal_pending(current)) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); goto out; } spin_lock_irq(&hugetlb_lock); } /* Add allocated pages to the pool */ if (!list_empty(&page_list)) { spin_unlock_irq(&hugetlb_lock); prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); } /* * Decrease the pool size * First return free pages to the buddy allocator (being careful * to keep enough around to satisfy reservations). Then place * pages into surplus state as needed so the pool will shrink * to the desired size as pages become free. * * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. * * min_count is the expected number of persistent pages, we * shouldn't calculate min_count by using * resv_huge_pages + persistent_huge_pages() - free_huge_pages, * because there may exist free surplus huge pages, and this will * lead to subtracting twice. 
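	 *
	 * Numeric sketch (illustrative): with resv = 2, persistent = 10
	 * and free = 12 of which 4 are surplus, the code below computes
	 * persistent_free_count = 12 - 4 = 8, so min_count = 2 + 10 - 8 = 4;
	 * naively subtracting all 12 free pages would have given 0.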
Free surplus huge pages come from HVO * failing to restore vmemmap, see comments in the callers of * hugetlb_vmemmap_restore_folio(). Thus, we should calculate * persistent free count first. */ persistent_free_count = h->free_huge_pages; if (h->free_huge_pages > persistent_huge_pages(h)) { if (h->free_huge_pages > h->surplus_huge_pages) persistent_free_count -= h->surplus_huge_pages; else persistent_free_count = 0; } min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); /* * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); flush_free_hpage_work(h); spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return 0; } static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, struct list_head *src_list) { long rc; struct folio *folio, *next; LIST_HEAD(dst_list); LIST_HEAD(ret_list); rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. 
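	 *
	 * (e.g. demoting one 1G folio with a 2M destination hstate yields
	 * pages_per_huge_page(src) / pages_per_huge_page(dst) = 512 new
	 * folios on dst_list.)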
*/ mutex_lock(&dst->resize_lock); list_for_each_entry_safe(folio, next, src_list, lru) { int i; if (folio_test_hugetlb_vmemmap_optimized(folio)) continue; list_del(&folio->lru); split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); /* Careful: see __split_huge_page_tail() */ struct folio *new_folio = (struct folio *)page; clear_compound_head(page); prep_compound_page(page, dst->order); new_folio->mapping = NULL; init_new_hugetlb_folio(dst, new_folio); list_add(&new_folio->lru, &dst_list); } } prep_and_add_allocated_folios(dst, &dst_list); mutex_unlock(&dst->resize_lock); return rc; } static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; struct hstate *dst; long rc = 0; long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } dst = size_to_hstate(PAGE_SIZE << src->demote_order); for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { LIST_HEAD(list); struct folio *folio, *next; list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; remove_hugetlb_folio(src, folio, false); list_add(&folio->lru, &list); if (++nr_demoted == nr_to_demote) break; } spin_unlock_irq(&hugetlb_lock); rc = demote_free_hugetlb_folios(src, dst, &list); spin_lock_irq(&hugetlb_lock); list_for_each_entry_safe(folio, next, &list, lru) { list_del(&folio->lru); add_hugetlb_folio(src, folio, false); nr_demoted--; } if (rc < 0 || nr_demoted == nr_to_demote) break; } /* * Not absolutely necessary, but for consistency update max_huge_pages * based on pool changes for the demoted page. */ src->max_huge_pages -= nr_demoted; dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); if (rc < 0) return rc; if (nr_demoted) return nr_demoted; /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. 
*/ return -EBUSY; } #define HSTATE_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define HSTATE_ATTR_WO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_WO(_name) #define HSTATE_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) { int i; for (i = 0; i < HUGE_MAX_HSTATE; i++) if (hstate_kobjs[i] == kobj) { if (nidp) *nidp = NUMA_NO_NODE; return &hstates[i]; } return kobj_to_node_hstate(kobj, nidp); } static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long nr_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) nr_huge_pages = h->nr_huge_pages; else nr_huge_pages = h->nr_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", nr_huge_pages); } static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h, int nid, unsigned long count, size_t len) { int err; nodemask_t nodes_allowed, *n_mask; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && init_nodemask_of_mempolicy(&nodes_allowed))) n_mask = &node_states[N_MEMORY]; else n_mask = &nodes_allowed; } else { /* * Node specific request. count adjustment happens in * set_max_huge_pages() after acquiring hugetlb_lock. */ init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } err = set_max_huge_pages(h, count, nid, n_mask); return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, const char *buf, size_t len) { struct hstate *h; unsigned long count; int nid; int err; err = kstrtoul(buf, 10, &count); if (err) return err; h = kobj_to_hstate(kobj, &nid); return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); } static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); #ifdef CONFIG_NUMA /* * hstate attribute for optionally mempolicy-based constraint on persistent * huge page alloc/free. 
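* For example, "numactl -m 2 sh -c 'echo 64 > /proc/sys/vm/nr_hugepages_mempolicy'" attempts the allocation only on node 2, whereas plain nr_hugepages ignores the task's mempolicy.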
*/ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); if (err) return err; spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; spin_unlock_irq(&hugetlb_lock); return count; } HSTATE_ATTR(nr_overcommit_hugepages); static ssize_t free_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long free_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) free_huge_pages = h->free_huge_pages; else free_huge_pages = h->free_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); static ssize_t surplus_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h; unsigned long surplus_huge_pages; int nid; h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) surplus_huge_pages = h->surplus_huge_pages; else surplus_huge_pages = h->surplus_huge_pages_node[nid]; return sysfs_emit(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); static ssize_t demote_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { unsigned long nr_demote; unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; int err; int nid; err = kstrtoul(buf, 10, &nr_demote); if (err) return err; h = kobj_to_hstate(kobj, &nid); if (nid != NUMA_NO_NODE) { init_nodemask_of_node(&nodes_allowed, nid); n_mask = &nodes_allowed; } else { n_mask = &node_states[N_MEMORY]; } /* Synchronize with other sysfs operations modifying huge pages */ mutex_lock(&h->resize_lock); spin_lock_irq(&hugetlb_lock); while (nr_demote) { long rc; /* * Check for available pages to demote each time through the * loop as demote_pool_huge_page will drop hugetlb_lock.
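* Usage example: "echo 2 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote" splits up to two free 1 GB pages into pages of the size previously selected via demote_size.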
*/ if (nid != NUMA_NO_NODE) nr_available = h->free_huge_pages_node[nid]; else nr_available = h->free_huge_pages; nr_available -= h->resv_huge_pages; if (!nr_available) break; rc = demote_pool_huge_page(h, n_mask, nr_demote); if (rc < 0) { err = rc; break; } nr_demote -= rc; } spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); if (err) return err; return len; } HSTATE_ATTR_WO(demote); static ssize_t demote_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; return sysfs_emit(buf, "%lukB\n", demote_size); } static ssize_t demote_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct hstate *h, *demote_hstate; unsigned long demote_size; unsigned int demote_order; demote_size = (unsigned long)memparse(buf, NULL); demote_hstate = size_to_hstate(demote_size); if (!demote_hstate) return -EINVAL; demote_order = demote_hstate->order; if (demote_order < HUGETLB_PAGE_ORDER) return -EINVAL; /* demote order must be smaller than hstate order */ h = kobj_to_hstate(kobj, NULL); if (demote_order >= h->order) return -EINVAL; /* resize_lock synchronizes access to demote size and writes */ mutex_lock(&h->resize_lock); h->demote_order = demote_order; mutex_unlock(&h->resize_lock); return count; } HSTATE_ATTR(demote_size); static struct attribute *hstate_attrs[] = { &nr_hugepages_attr.attr, &nr_overcommit_hugepages_attr.attr, &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, #ifdef CONFIG_NUMA &nr_hugepages_mempolicy_attr.attr, #endif NULL, }; static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; static struct attribute *hstate_demote_attrs[] = { &demote_size_attr.attr, &demote_attr.attr, NULL, }; static const struct attribute_group hstate_demote_attr_group = { .attrs = hstate_demote_attrs, }; static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, const struct attribute_group *hstate_attr_group) { int retval; int hi = hstate_index(h); hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); if (!hstate_kobjs[hi]) return -ENOMEM; retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); if (retval) { kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; return retval; } if (h->demote_order) { retval = sysfs_create_group(hstate_kobjs[hi], &hstate_demote_attr_group); if (retval) { pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); kobject_put(hstate_kobjs[hi]); hstate_kobjs[hi] = NULL; return retval; } } return 0; } #ifdef CONFIG_NUMA static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, * with node devices in node_devices[] using a parallel array. The array * index of a node device or _hstate == node id. * This is here to avoid any static dependency of the node device driver, in * the base kernel, on the hugetlb module. 
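* In other words, node_hstates[nid] holds the kobjects for node nid, and the node id is recovered by searching this array in kobj_to_node_hstate().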
*/ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices */ static struct attribute *per_node_hstate_attrs[] = { &nr_hugepages_attr.attr, &free_hugepages_attr.attr, &surplus_hugepages_attr.attr, NULL, }; static const struct attribute_group per_node_hstate_attr_group = { .attrs = per_node_hstate_attrs, }; /* * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. * Returns node id via non-NULL nidp. */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) { int nid; for (nid = 0; nid < nr_node_ids; nid++) { struct node_hstate *nhs = &node_hstates[nid]; int i; for (i = 0; i < HUGE_MAX_HSTATE; i++) if (nhs->hstate_kobjs[i] == kobj) { if (nidp) *nidp = nid; return &hstates[i]; } } BUG(); return NULL; } /* * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; if (!nhs->hugepages_kobj) return; /* no hstate attributes */ for_each_hstate(h) { int idx = hstate_index(h); struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; if (!hstate_kobj) continue; if (h->demote_order) sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); kobject_put(hstate_kobj); nhs->hstate_kobjs[idx] = NULL; } kobject_put(nhs->hugepages_kobj); nhs->hugepages_kobj = NULL; } /* * Register hstate attributes for a single node device. * No-op if attributes already registered. */ void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; if (!hugetlb_sysfs_initialized) return; if (nhs->hugepages_kobj) return; /* already allocated */ nhs->hugepages_kobj = kobject_create_and_add("hugepages", &node->dev.kobj); if (!nhs->hugepages_kobj) return; for_each_hstate(h) { err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, nhs->hstate_kobjs, &per_node_hstate_attr_group); if (err) { pr_err("HugeTLB: Unable to add hstate %s for node %d\n", h->name, node->dev.id); hugetlb_unregister_node(node); break; } } } /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have * registered their associated device by this time. 
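* The resulting per-node files live under /sys/devices/system/node/nodeN/hugepages/hugepages-<size>kB/, mirroring a subset of the global attributes.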
*/ static void __init hugetlb_register_all_nodes(void) { int nid; for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); } #else /* !CONFIG_NUMA */ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) { BUG(); if (nidp) *nidp = -1; return NULL; } static void hugetlb_register_all_nodes(void) { } #endif static void __init hugetlb_sysfs_init(void) { struct hstate *h; int err; hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); if (!hugepages_kobj) return; for_each_hstate(h) { err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, hstate_kobjs, &hstate_attr_group); if (err) pr_err("HugeTLB: Unable to add hstate %s\n", h->name); } #ifdef CONFIG_NUMA hugetlb_sysfs_initialized = true; #endif hugetlb_register_all_nodes(); } #ifdef CONFIG_SYSCTL static void hugetlb_sysctl_init(void); #else static inline void hugetlb_sysctl_init(void) { } #endif static int __init hugetlb_init(void) { int i; BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); return 0; } /* * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some * architectures depend on setup being done here. */ hugetlb_add_hstate(HUGETLB_PAGE_ORDER); if (!parsed_default_hugepagesz) { /* * If we did not parse a default huge page size, set * default_hstate_idx to HPAGE_SIZE hstate. And, if the * number of huge pages for this default size was implicitly * specified, set that here as well. * Note that the implicit setting will overwrite an explicit * setting. A warning will be printed in this case. */ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); if (default_hstate_max_huge_pages) { if (default_hstate.max_huge_pages) { char buf[32]; string_get_size(huge_page_size(&default_hstate), 1, STRING_UNITS_2, buf, 32); pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", default_hstate_max_huge_pages); } default_hstate.max_huge_pages = default_hstate_max_huge_pages; for_each_online_node(i) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } } hugetlb_cma_check(); hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); hugetlb_sysfs_init(); hugetlb_cgroup_file_init(); hugetlb_sysctl_init(); #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else num_fault_mutexes = 1; #endif hugetlb_fault_mutex_table = kmalloc_array(num_fault_mutexes, sizeof(struct mutex), GFP_KERNEL); BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } subsys_initcall(hugetlb_init); /* Overwritten by architectures with more huge page sizes */ bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) { return size == HPAGE_SIZE; } void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); 
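/* The name below becomes the per-hstate sysfs directory name, e.g. "hugepages-2048kB" for 2 MB pages. */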
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/SZ_1K); parsed_hstate = h; } bool __init __weak hugetlb_node_alloc_supported(void) { return true; } static void __init hugepages_clear_pages_in_node(void) { if (!hugetlb_max_hstate) { default_hstate_max_huge_pages = 0; memset(default_hugepages_in_node, 0, sizeof(default_hugepages_in_node)); } else { parsed_hstate->max_huge_pages = 0; memset(parsed_hstate->max_huge_pages_node, 0, sizeof(parsed_hstate->max_huge_pages_node)); } } static __init int hugetlb_add_param(char *s, int (*setup)(char *)) { size_t len; char *p; if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS) return -EINVAL; len = strlen(s) + 1; if (len + hstate_cmdline_index > sizeof(hstate_cmdline_buf)) return -EINVAL; p = &hstate_cmdline_buf[hstate_cmdline_index]; memcpy(p, s, len); hstate_cmdline_index += len; hugetlb_params[hugetlb_param_index].val = p; hugetlb_params[hugetlb_param_index].setup = setup; hugetlb_param_index++; return 0; } static __init void hugetlb_parse_params(void) { int i; struct hugetlb_cmdline *hcp; for (i = 0; i < hugetlb_param_index; i++) { hcp = &hugetlb_params[i]; hcp->setup(hcp->val); } hugetlb_cma_validate_params(); } /* * hugepages command line processing * hugepages normally follows a valid hugepagesz or default_hugepagesz * specification. If not, ignore the hugepages value. hugepages can also * be the first huge page command line option in which case it implicitly * specifies the number of huge pages for the default size. */ static int __init hugepages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; int node = NUMA_NO_NODE; int count; unsigned long tmp; char *p = s; if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; return -EINVAL; } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter * yet, so this hugepages= parameter goes to the "default hstate". * Otherwise, it goes with the previously parsed hugepagesz or * default_hugepagesz. */ else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); return 1; } while (*p) { count = 0; if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; /* Parameter is node format */ if (p[count] == ':') { if (!hugetlb_node_alloc_supported()) { pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); return 1; } if (tmp >= MAX_NUMNODES || !node_online(tmp)) goto invalid; node = array_index_nospec(tmp, MAX_NUMNODES); p += count + 1; /* Parse hugepages */ if (sscanf(p, "%lu%n", &tmp, &count) != 1) goto invalid; if (!hugetlb_max_hstate) default_hugepages_in_node[node] = tmp; else parsed_hstate->max_huge_pages_node[node] = tmp; *mhp += tmp; /* Go to parse next node */ if (p[count] == ',') p += count + 1; else break; } else { if (p != s) goto invalid; *mhp = tmp; break; } } last_mhp = mhp; return 0; invalid: pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); hugepages_clear_pages_in_node(); return -EINVAL; } hugetlb_early_param("hugepages", hugepages_setup); /* * hugepagesz command line processing * A specific huge page size can only be specified once with hugepagesz. * hugepagesz is followed by hugepages on the command line. The global * variable 'parsed_valid_hugepagesz' is used to determine if the prior * hugepagesz argument was valid.
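* For example, booting with "hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=512" creates both hstates, and a node-format value such as "hugepages=0:1,1:1" after a hugepagesz= requests per-node counts (one page on node 0 and one on node 1).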
*/ static int __init hugepagesz_setup(char *s) { unsigned long size; struct hstate *h; parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); return -EINVAL; } h = size_to_hstate(size); if (h) { /* * hstate for this size already exists. This is normally * an error, but is allowed if the existing hstate is the * default hstate. More specifically, it is only allowed if * the number of huge pages for the default hstate was not * previously specified. */ if (!parsed_default_hugepagesz || h != &default_hstate || default_hstate.max_huge_pages) { pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); return -EINVAL; } /* * No need to call hugetlb_add_hstate() as hstate already * exists. But, do set parsed_hstate so that a following * hugepages= parameter will be applied to this hstate. */ parsed_hstate = h; parsed_valid_hugepagesz = true; return 0; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; return 0; } hugetlb_early_param("hugepagesz", hugepagesz_setup); /* * default_hugepagesz command line input * Only one instance of default_hugepagesz allowed on command line. */ static int __init default_hugepagesz_setup(char *s) { unsigned long size; int i; parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); return -EINVAL; } size = (unsigned long)memparse(s, NULL); if (!arch_hugetlb_valid_size(size)) { pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); return -EINVAL; } hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); parsed_valid_hugepagesz = true; parsed_default_hugepagesz = true; default_hstate_idx = hstate_index(size_to_hstate(size)); /* * The number of default huge pages (for this size) could have been * specified as the first hugetlb parameter: hugepages=X. If so, * then default_hstate_max_huge_pages is set. If the default huge * page size is gigantic (> MAX_PAGE_ORDER), then the pages must be * allocated here from bootmem allocator. */ if (default_hstate_max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; /* * Since this is an early parameter, we can't check * NUMA node state yet, so loop through MAX_NUMNODES. */ for (i = 0; i < MAX_NUMNODES; i++) { if (default_hugepages_in_node[i] != 0) default_hstate.max_huge_pages_node[i] = default_hugepages_in_node[i]; } default_hstate_max_huge_pages = 0; } return 0; } hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); static bool __hugetlb_bootmem_allocated __initdata; bool __init hugetlb_bootmem_allocated(void) { return __hugetlb_bootmem_allocated; } void __init hugetlb_bootmem_alloc(void) { struct hstate *h; int i; if (__hugetlb_bootmem_allocated) return; for (i = 0; i < MAX_NUMNODES; i++) INIT_LIST_HEAD(&huge_boot_pages[i]); hugetlb_parse_params(); for_each_hstate(h) { h->next_nid_to_alloc = first_online_node; h->next_nid_to_free = first_online_node; if (hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } __hugetlb_bootmem_allocated = true; } /* * hugepage_alloc_threads command line parsing. * * When set, use this specific number of threads for the boot * allocation of hugepages. 
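* For example, "hugepage_alloc_threads=8" asks for eight parallel allocation threads; values that fail to parse or are zero are ignored.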
*/ static int __init hugepage_alloc_threads_setup(char *s) { unsigned long allocation_threads; if (kstrtoul(s, 0, &allocation_threads) != 0) return 1; if (allocation_threads == 0) return 1; hugepage_allocation_threads = allocation_threads; return 1; } __setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup); static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } return nr; } #ifdef CONFIG_SYSCTL static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos, unsigned long *out) { struct ctl_table dup_table; /* * In order to avoid races with __do_proc_doulongvec_minmax(), we * can duplicate the @table and alter the duplicate of it. */ dup_table = *table; dup_table.data = out; return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); } static int hugetlb_sysctl_handler_common(bool obey_mempolicy, const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -EOPNOTSUPP; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, &tmp); if (ret) goto out; if (write) ret = __nr_hugepages_store_common(obey_mempolicy, h, NUMA_NO_NODE, tmp, *length); out: return ret; } static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, buffer, length, ppos); } #ifdef CONFIG_NUMA static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); } #endif /* CONFIG_NUMA */ static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; int ret; if (!hugepages_supported()) return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; if (write && hstate_is_gigantic(h)) return -EINVAL; ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, &tmp); if (ret) goto out; if (write) { spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock_irq(&hugetlb_lock); } out: return ret; } static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, }, #ifdef CONFIG_NUMA { .procname = "nr_hugepages_mempolicy", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, #endif { .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, .maxlen = sizeof(gid_t), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, }; static void __init hugetlb_sysctl_init(void) { register_sysctl_init("vm", hugetlb_table); } #endif /* CONFIG_SYSCTL */ void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h; unsigned long total = 0; if 
(!hugepages_supported()) return; for_each_hstate(h) { unsigned long count = h->nr_huge_pages; total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %8lu kB\n", count, h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, huge_page_size(h) / SZ_1K); } seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) { struct hstate *h = &default_hstate; if (!hugepages_supported()) return 0; return sysfs_emit_at(buf, len, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", nid, h->nr_huge_pages_node[nid], nid, h->free_huge_pages_node[nid], nid, h->surplus_huge_pages_node[nid]); } void hugetlb_show_meminfo_node(int nid) { struct hstate *h; if (!hugepages_supported()) return; for_each_hstate(h) printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", nid, h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { struct hstate *h; unsigned long nr_total_pages = 0; for_each_hstate(h) nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; if (!delta) return 0; spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such * reservation is completely rubbish in the presence of cpuset because * the reservation is not checked against page availability for the * current cpuset. Applications can still potentially be OOM-killed by * the kernel due to lack of free htlb pages in the cpuset that the * task is in. Attempting to enforce strict accounting with cpuset is * almost impossible (or too ugly) because cpuset is so fluid that a * task or memory node can be dynamically moved between cpusets. * * The change of semantics for shared hugetlb mapping with cpuset is * undesirable. However, in order to preserve some of the semantics, * we fall back to checking against the current free page availability * as a best-effort attempt, hopefully minimizing the impact of the * semantic changes that cpuset brings. * * Apart from cpuset, we also have the memory policy mechanism, which * determines from which node the kernel will allocate memory in a * NUMA system. So, similar to cpuset, we should also consider the * memory policy of the current task, as described above. */ if (delta > 0) { if (gather_surplus_pages(h, delta) < 0) goto out; if (delta > allowed_mems_nr(h)) { return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock_irq(&hugetlb_lock); return ret; } static void hugetlb_vm_op_open(struct vm_area_struct *vma) { struct resv_map *resv = vma_resv_map(vma); /* * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its sibling's reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA.
As that VMA * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } /* * vma_lock structure for sharable mappings is vma specific. * Clear old pointer (if copied via vm_area_dup) and allocate * new structure. Before clearing, make sure vma_lock is not * for this vma. */ if (vma->vm_flags & VM_MAYSHARE) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; if (vma_lock) { if (vma_lock->vma != vma) { vma->vm_private_data = NULL; hugetlb_vma_lock_alloc(vma); } else pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); } else hugetlb_vma_lock_alloc(vma); } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; hugetlb_vma_lock_free(vma); resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(resv, start, end); hugetlb_cgroup_uncharge_counter(resv, start, end); if (reserve) { /* * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ gbl_reserve = hugepage_subpool_put_pages(spool, reserve); hugetlb_acct_memory(h, -gbl_reserve); } kref_put(&resv->refs, resv_map_release); } static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. */ if (addr & ~PUD_MASK) { /* * hugetlb_vm_op_split is called right before we attempt to * split the VMA. We will need to unshare PMDs in the old and * new VMAs, so let's unshare before we split. */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; if (floor >= vma->vm_start && ceil <= vma->vm_end) hugetlb_unshare_pmds(vma, floor, ceil); } return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { return huge_page_size(hstate_vma(vma)); } /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; } /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. * This is because under the System V memory model, mappings created via * shmget/shmat with "huge page" specified are backed by hugetlbfs files, * and their original vm_ops are overwritten with shm_vm_ops.
*/ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, bool try_mkwrite) { pte_t entry; unsigned int shift = huge_page_shift(hstate_vma(vma)); if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, vma->vm_page_prot))); } else { entry = huge_pte_wrprotect(mk_huge_pte(page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; } static void set_huge_ptep_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t entry; entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (vma->vm_flags & VM_WRITE) set_huge_ptep_writable(vma, address, ptep); } bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); if (is_migration_entry(swp)) return true; else return false; } bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) return false; swp = pte_to_swp_entry(pte); if (is_hwpoison_entry(swp)) return true; else return false; } static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { pte_t newpte = make_huge_pte(vma, &new_folio->page, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pte_t *src_pte, *dst_pte, entry; struct folio *pte_folio; unsigned long addr; bool cow = is_cow_mapping(src_vma->vm_flags); struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; if (cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* * For shared mappings the vma lock must be held before * calling hugetlb_walk() in the src vma. Otherwise, the * returned ptep could go away if part of a shared pmd and * another thread calls huge_pmd_unshare. */ hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = hugetlb_walk(src_vma, addr, sz); if (!src_pte) { addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; } /* * If the pagetables are shared don't copy or take references. 
* * dst_pte == src_pte is the common case of src/dest sharing. * However, src could have 'unshared' and dst shares with * another vma. So page_count of ptep page is checked instead * to reliably determine whether pte is shared. */ if (page_count(virt_to_page(dst_pte)) > 1) { addr |= last_addr_mask; continue; } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { /* * Skip if src entry none. */ ; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); bool uffd_wp = pte_swp_uffd_wp(entry); if (!is_readable_migration_entry(swp_entry) && cow) { /* * COW mappings require pages in both * parent and child to be set to read. */ swp_entry = make_readable_migration_entry( swp_offset(swp_entry)); entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); set_huge_pte_at(src, addr, src_pte, entry, sz); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_pte_marker(entry))) { pte_marker marker = copy_pte_marker( pte_to_swp_entry(entry), dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); /* * Failing to duplicate the anon rmap is a rare case * where we see pinned hugetlb pages while they're * prone to COW. We need to do the COW earlier during * fork. * * When pre-allocating the page or copying data, we * need to be without the pgtable locks since we could * sleep during the process. */ if (!folio_test_anon(pte_folio)) { hugetlb_add_file_rmap(pte_folio); } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) { pte_t src_pte_old = entry; struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } ret = copy_user_large_folio(new_folio, pte_folio, addr, dst_vma); folio_put(pte_folio); if (ret) { folio_put(new_folio); break; } /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio, src_pte_old, sz); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; } if (cow) { /* * No need to notify as we are downgrading page * table protection not changing it to point * to a new page. 
* * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); } if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } else { hugetlb_vma_unlock_read(src_vma); } return ret; } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; pte_t pte; dst_ptl = huge_pte_lock(h, mm, dst_pte); src_ptl = huge_pte_lockptr(h, mm, src_pte); /* * We don't have to worry about the ordering of src and dst ptlocks * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock. */ if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = huge_pte_clear_uffd_wp(pte); else if (is_swap_pte(pte)) pte = pte_swp_clear_uffd_wp(pte); } set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); } if (src_ptl != dst_ptl) spin_unlock(src_ptl); spin_unlock(dst_ptl); } int move_hugetlb_page_tables(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long len) { struct hstate *h = hstate_vma(vma); struct address_space *mapping = vma->vm_file->f_mapping; unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; bool shared_pmd = false; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* * In case of shared PMDs, we should cover the maximum possible * range. 
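* (For example, with 2 MB pages on x86-64, PMD sharing works at PUD_SIZE (1 GB) granularity, so adjust_range_if_pmd_sharing_possible() may round start/end out to the enclosing 1 GB region.)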
*/ flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = hugetlb_walk(vma, old_addr, sz); if (!src_pte) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { shared_pmd = true; old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; } dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); if (!dst_pte) break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); } if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; pte_t pte; spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); bool adjust_reservation = false; unsigned long last_addr_mask; bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); /* * This is a hugetlb vma, all the pte entries should point * to huge page. */ tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) { address |= last_addr_mask; continue; } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { spin_unlock(ptl); tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); force_flush = true; address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; } /* * Migrating hugepage or HWPoisoned hugepage is already * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to * drop the uffd-wp bit in this zap, then replace the * pte with a marker. */ if (pte_swp_uffd_wp_any(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); else huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } page = pte_page(pte); /* * If a reference page is supplied, it is because a specific * page is being unmapped, not a range. Ensure the page we * are about to unmap is the actual page of interest. 
*/ if (ref_page) { if (page != ref_page) { spin_unlock(ptl); continue; } /* * Mark the VMA as having unmapped its page so that * future faults in this VMA will fail rather than * looking like data was lost. */ set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); /* * Restore the reservation for anonymous page, otherwise the * backing page could be stolen by someone. * If we are freeing a surplus, do not set the restore * reservation bit. */ if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(page_folio(page))) { folio_set_hugetlb_restore_reserve(page_folio(page)); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } spin_unlock(ptl); /* * Adjust the reservation for the region that will have the * reserve restored. Keep in mind that vma_needs_reservation() changes * resv->adds_in_progress if it succeeds. If this is not done, * do_exit() will not see it, and will keep the reservation * forever. */ if (adjust_reservation) { int rc = vma_needs_reservation(h, vma, address); if (rc < 0) /* Presumably allocate_file_region_entries failed * to allocate a file_region struct. Clear * hugetlb_restore_reserve so that global reserve * count will not be incremented by free_huge_folio. * Act as if we consumed the reservation. */ folio_clear_hugetlb_restore_reserve(page_folio(page)); else if (rc) vma_add_reservation(h, vma, address); } tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied */ if (ref_page) break; } tlb_end_vma(tlb, vma); /* * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We * could defer the flush until now, since by holding i_mmap_rwsem we * guaranteed that the last reference would not be dropped. But we must * do the flushing before we return, as otherwise i_mmap_rwsem will be * dropped and the last reference to the shared PMDs page might be * dropped as well. * * In theory we could defer the freeing of the PMD pages as well, but * huge_pmd_unshare() relies on the exact page_count for the PMD page to * detect sharing, so we cannot defer the release of the page either. * Instead, do the flush now. */ if (force_flush) tlb_flush_mmu_tlbonly(tlb); } void __hugetlb_zap_begin(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; adjust_range_if_pmd_sharing_possible(vma, start, end); hugetlb_vma_lock_write(vma); if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); } void __hugetlb_zap_end(struct vm_area_struct *vma, struct zap_details *details) { zap_flags_t zap_flags = details ? details->zap_flags : 0; if (!vma->vm_file) /* hugetlbfs_file_mmap error */ return; if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ /* * Unlock and free the vma lock before releasing i_mmap_rwsem. * When the vma_lock is freed, this makes the vma ineligible * for pmd sharing. And, i_mmap_rwsem is required to set up * pmd sharing. This is important as page tables for this * unmapped range will be asynchronously deleted.
If the page * tables are shared, there will be issues when accessed by * someone else. */ __hugetlb_vma_unlock_write_free(vma); } else { hugetlb_vma_unlock_write(vma); } if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } /* * This is called when the original mapper is failing to COW a MAP_PRIVATE * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; pgoff_t pgoff; /* * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; mapping = vma->vm_file->f_mapping; /* * Take the mapping lock for the duration of the table walk. As * this mapping should be shared between all the VMAs, * __unmap_hugepage_range() is called as the lock is already held */ i_mmap_lock_write(mapping); vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) continue; /* * Shared VMAs have their own reserves and do not affect * MAP_PRIVATE accounting but it is possible that a shared * VMA is using the same page so check and skip such VMAs. */ if (iter_vma->vm_flags & VM_MAYSHARE) continue; /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these * areas. This is because a future no-page fault on this VMA * could insert a zeroed page instead of the data existing * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page, 0); } i_mmap_unlock_write(mapping); } /* * hugetlb_wp() should be called with page lock of the original hugepage held. * Called with hugetlb_fault_mutex_table held and pte_page locked so we * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; bool cow_from_owner = 0; vm_fault_t ret = 0; struct mmu_notifier_range range; /* * Never handle CoW for uffd-wp protected pages. It should be only * handled when the uffd-wp protection is removed. 
* * Note that only the CoW optimization path (in hugetlb_no_page()) * can trigger this, because hugetlb_fault() will always resolve * uffd-wp bit first. */ if (!unshare && huge_pte_uffd_wp(pte)) return 0; /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } old_folio = page_folio(pte_page(pte)); delayacct_wpcopy_start(); retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. * * Note that we don't rely on the (safer) folio refcount here, because * copying the hugetlb folio when there are unexpected (temporary) * folio references could harm simple fork()+exit() users when * we run out of free hugetlb folios: we would have to kill processes * in scenarios that used to work. As a side effect, there can still * be leaks between processes, for example, with FOLL_GET users. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) set_huge_ptep_maybe_writable(vma, vmf->address, vmf->pte); delayacct_wpcopy_end(); return 0; } VM_BUG_ON_PAGE(folio_test_anon(old_folio) && PageAnonExclusive(&old_folio->page), &old_folio->page); /* * If the process that created a MAP_PRIVATE mapping is about to * perform a COW due to a shared page count, attempt to satisfy * the allocation without using the existing reserves. The pagecache * page is used to determine if the reserve at this address was * consumed or not. If reserves were used, a partial faulted mapping * at the time of fork() could consume its reserves on COW instead * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) cow_from_owner = true; folio_get(old_folio); /* * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ spin_unlock(vmf->ptl); new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient * huge page pool. To guarantee the original mapper's * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; folio_put(old_folio); /* * Drop hugetlb_fault_mutex and vma_lock before * unmapping. unmapping needs to hold vma_lock * in write mode. Dropping vma_lock in read mode * here is OK as COW mappings do not interact with * PMD sharing. * * Reacquire both after unmap operation. */ idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); unmap_ref_private(mm, vma, &old_folio->page, vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) goto retry_avoidcopy; /* * A race occurred while re-acquiring the page table * lock, and our job is done.
*/ delayacct_wpcopy_end(); return 0; } ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } /* * When the original hugepage is a shared one, it does not have * anon_vma prepared. */ ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out_release_all; if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ spin_lock(vmf->ptl); vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* * No restore in case of successful pagetable update (Break COW or * unshare) */ if (new_folio != old_folio) restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; } /* * Return whether there is a pagecache page to back a given address within the * VMA. */ bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); if (IS_ERR(folio)) return false; folio_put(folio); return true; } int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); if (unlikely(err)) { __folio_clear_locked(folio); return err; } folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file * by non-hugetlbfs specific code paths. */ folio_mark_dirty(folio); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); return 0; } static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, unsigned long reason) { u32 hash; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, so any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vmf->vma); hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(vmf, reason); } /* * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing.
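* Callers such as hugetlb_no_page() use this to re-validate vmf->orig_pte after having run without the lock, e.g. before deciding between a retry and delivering a userfault or SIGBUS.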
*/ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte); spin_unlock(ptl); return same; } static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed * COW/unsharing. Warn that such a situation has occurred as it may not * be obvious. */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); goto out; } /* * Use page lock to guard against racing truncation * before we get page_table_lock. */ new_folio = false; folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { /* * Since hugetlb_no_page() was examining pte * without pgtable lock, we need to re-test under * lock because the pte may not be stable and could * have changed from under us. Try to detect * either changed or still-changing ptes and retry * properly when needed. * * Note that userfaultfd is actually fine with * false positives (e.g. caused by pte changed), * but not wrong logical events (e.g. caused by * reading a pte while it is being changed). The * latter can confuse userspace, so strictness is * very much preferred. E.g., MISSING event should * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } if (!(vma->vm_flags & VM_MAYSHARE)) { ret = __vmf_anon_prepare(vmf); if (unlikely(ret)) goto out; } folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two * tasks from racing to fault in the same page, which * could result in spurious "unable to allocate" errors. * Page migration does not take the fault mutex, but * does a clear then write of pte's under page table * lock. Page fault code could race with migration, * notice the clear pte and try to allocate a page * here. Before returning error, get ptl and make * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } folio_zero_user(folio, vmf->real_address); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(folio, mapping, vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone * else consumed the reservation since hugetlb * fault mutex is held when adding a hugetlb page * to the page cache. So it's safe to call * restore_reserve_on_error() here.
*/ restore_reserve_on_error(h, vma, vmf->address, folio); folio_put(folio); ret = VM_FAULT_SIGBUS; goto out; } new_pagecache_folio = true; } else { folio_lock(folio); anon_rmap = 1; } } else { /* * If a memory error occurs between mmap() and fault, some process * may not have a hwpoisoned swap entry for the errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside * the spinlock. */ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf->address); } vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_wp(folio, vmf); } spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ if (new_folio) folio_set_hugetlb_migratable(folio); folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); /* * We must check whether to release the per-VMA lock. __vmf_anon_prepare() * is the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); goto out; } #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; key[0] = (unsigned long) mapping; key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } #else /* * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead.
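* * Callers are oblivious to the difference; the usual pattern * (illustrative only) is the same as on SMP: * *	hash = hugetlb_fault_mutex_hash(mapping, idx); *	mutex_lock(&hugetlb_fault_mutex_table[hash]); *	... fault in or truncate the page at idx ... *	mutex_unlock(&hugetlb_fault_mutex_table[hash]); * * On UP every caller simply serializes on hugetlb_fault_mutex_table[0].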
*/ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } #endif vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { vm_fault_t ret; u32 hash; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), .real_address = address, .flags = flags, .pgoff = vma_hugecache_offset(h, vma, address & huge_page_mask(h)), /* TODO: Track hugetlb faults using vm_fault */ /* * Some fields may not be initialized; be careful, as it may * be hard to debug if called functions make assumptions */ }; /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold * until finished with vmf.pte. This prevents huge_pmd_unshare from * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); if (huge_pte_none_mostly(vmf.orig_pte)) { if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { /* This isn't supported in hugetlb. */ ret = VM_FAULT_SIGSEGV; goto out_mutex; } } /* * Other PTE markers should be handled the same way as none PTE. * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which makes us return immediately. */ return hugetlb_no_page(mapping, &vmf); } ret = 0; /* * vmf.orig_pte could be a migration or hwpoison entry at this * point, so this check prevents the code below from assuming * that we have an active hugepage in pagecache. This goto expects * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) * check will properly handle it. */ if (!pte_present(vmf.orig_pte)) { if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the * huge_pte_lockptr() later in * migration_entry_wait_huge(). The vma lock will * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; } /* * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the * spinlock. Also lookup the pagecache page now as it is used to * determine if a reservation has been consumed.
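* (vma_needs_reservation() may have to allocate file_region entries * and can therefore sleep, which is why it must run before * huge_pte_lock() takes vmf.ptl below.)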
*/ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } /* * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need to take the former * when folio != pagecache_folio or !pagecache_folio. */ folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; goto out_ptl; } folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } vmf.orig_pte = pte_mkyoung(vmf.orig_pte); if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); /* * We must check whether to release the per-VMA lock. __vmf_anon_prepare() * in hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. */ if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold a refcount while waiting for the page * lock. But here we only wait to defer the next page fault and avoid a * busy loop; the page is not used after it is unlocked before returning * from the current page fault. So we are safe from accessing a freed * page, even if we wait here without taking a refcount. */ if (need_wait_lock) folio_wait_locked(folio); return ret; } #ifdef CONFIG_USERFAULTFD /* * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). */ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); /* * This is used to allocate a temporary hugetlb to hold the copied * content, which will then be copied again to the final hugetlb * consuming a reservation.
Set the alloc_fallback to false to indicate * that breaking the per-node hugetlb pool is not allowed in this case. */ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; } /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. */ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { struct mm_struct *dst_mm = dst_vma->vm_mm; bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE); bool wp_enabled = (flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); unsigned long size = huge_page_size(h); int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { spin_unlock(ptl); return -EEXIST; } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); return 0; } if (is_continue) { ret = -EFAULT; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; } else if (!*foliop) { /* If a folio already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. */ if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { ret = -EEXIST; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } ret = copy_folio_from_user(folio, (const void __user *) src_addr, false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; /* Free the allocated folio which may have * consumed a reservation. */ restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); /* Allocate a temporary folio to hold the copied * contents. */ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); if (!folio) { ret = -ENOMEM; goto out; } *foliop = folio; /* Set the outparam foliop and return to the caller to * copy the contents outside the lock. Don't free the * folio. */ goto out; } } else { if (vm_shared && hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { folio_put(*foliop); ret = -EEXIST; *foliop = NULL; goto out; } folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; *foliop = NULL; goto out; } ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma); folio_put(*foliop); *foliop = NULL; if (ret) { folio_put(folio); goto out; } } /* * If we just allocated a new page, we need a memory barrier to ensure * that preceding stores to the page become visible before the * set_pte_at() write. The memory barrier inside __folio_mark_uptodate * is what we need. * * In the case where we have not allocated a new page (is_continue), * the page must already be uptodate. UFFDIO_CONTINUE already includes * an earlier smp_wmb() to ensure that prior stores will be visible * before the set_pte_at() write. 
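* * In short, the pairing guarantees that any thread which observes * the pte installed by set_huge_pte_at() below also observes the * fully written page contents.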
*/ if (!is_continue) __folio_mark_uptodate(folio); else WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { ret = -EFAULT; if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h))) goto out_release_nounlock; /* * Serialization between remove_inode_hugepages() and * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be held by * the caller. */ ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; if (folio_test_hwpoison(folio)) goto out_release_unlock; /* * We allow overwriting a pte marker: consider the case when both * MISSING|WP are registered; we first wr-protect a none pte which has no * page cache page backing it, then access the page. */ ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) goto out_release_unlock; if (folio_in_pagecache) hugetlb_add_file_rmap(folio); else hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set the pte write bit. */ _dst_pte = make_huge_pte(dst_vma, &folio->page, !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not * supported, but we should still make clear that this page cannot be * thrown away at will, even if the write bit is not set. */ _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); if (!is_continue) folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) folio_unlock(folio); out_release_nounlock: if (!folio_in_pagecache) restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; unsigned long start = address; pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * In the case of shared PMDs, the area to flush could be beyond * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible.
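* * For example (illustrative, x86-64 with 2MB hugetlb pages and * PUD_SIZE = 1GB): unsharing a PMD page table drops mappings for a * whole 1GB region at once, so even if [start, end) covers a single * 2MB page the notifier/flush range may have to be widened to the * enclosing PUD_SIZE-aligned boundaries.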
*/ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = hugetlb_walk(vma, address, psize); if (!ptep) { if (!uffd_wp) { address |= last_addr_mask; continue; } /* * Userfaultfd wr-protect requires pgtable * pre-allocations to install pte markers. */ ptep = huge_pte_alloc(mm, vma, address, psize); if (!ptep) { pages = -ENOMEM; break; } } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn if it happens * for any reason. */ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); shared_pmd = true; address |= last_addr_mask; continue; } pte = huge_ptep_get(mm, address, ptep); if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); struct page *page = pfn_swap_entry_to_page(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { if (PageAnon(page)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else entry = make_readable_migration_entry( swp_offset(entry)); newpte = swp_entry_to_pte(entry); pages++; } if (uffd_wp) newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { /* * Do nothing on a poison marker; page is * corrupted, permissions do not apply. Here * pte_marker_uffd_wp()==true implies !poison * because they're mutually exclusive. */ if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { pte_t old_pte; unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (uffd_wp) pte = huge_pte_mkuffd_wp(pte); else if (uffd_wp_resolve) pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; } else { /* None pte */ if (unlikely(uffd_wp)) /* Safe to modify directly (none->non-present). */ set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP), psize); } spin_unlock(ptl); } /* * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: * once we release i_mmap_rwsem, another task can do the final put_page * and that page table may then be reused and filled with junk. If we * actually did unshare a page of pmds, flush the range corresponding to * the pud. */ if (shared_pmd) flush_hugetlb_tlb_range(vma, range.start, range.end); else flush_hugetlb_tlb_range(vma, start, end); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), as we * are downgrading page table protection, not changing it to point to a * new page.
* * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages > 0 ? (pages << h->order) : pages; } /* Return true if reservation was successful, false otherwise. */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) { long chg = -1, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); return false; } /* * vma specific semaphore used for pmd sharing and fault/truncation * synchronization */ hugetlb_vma_lock_alloc(vma); /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page * without using reserves */ if (vm_flags & VM_NORESERVE) return true; /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see * hugetlbfs_get_inode). */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, &regions_needed); } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) goto out_err; chg = to - from; set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) goto out_err; if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * off the resv_map. */ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); } /* * There must be enough pages in the subpool for the mapping. If * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); if (gbl_reserve < 0) goto out_uncharge_cgroup; /* * Check that enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not enough. */ if (hugetlb_acct_memory(h, gbl_reserve) < 0) goto out_put_pages; /* * Account for the reservations made. Shared mappings record regions * that have reservations as they are shared by multiple VMAs. * When the last VMA disappears, the region map says how much * the reservation was and the page cache tells how much of * the reservation was consumed. Private mappings are per-VMA and * only the consumed reservations are tracked. When the VMA * disappears, the original reservation is the VMA size and the * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); goto out_put_pages; } else if (unlikely(chg > add)) { /* * pages in this range were added to the reserve * map between region_chg and region_add. This * indicates a race with alloc_hugetlb_folio.
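* For example (illustrative numbers only): region_chg() may have * returned chg = 4 while a racing fault consumed two reservations * from the range before region_add() ran, so add = 2 and the two * double-counted pages must be given back.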
Adjust * the subpool and reserve counts modified above * based on the difference. */ long rsv_adjust; /* * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the * reference to h_cg->css. See comment below for detail. */ hugetlb_cgroup_uncharge_cgroup_rsvd( hstate_index(h), (chg - add) * pages_per_huge_page(h), h_cg); rsv_adjust = hugepage_subpool_put_pages(spool, chg - add); hugetlb_acct_memory(h, -rsv_adjust); } else if (h_cg) { /* * The file_regions will hold their own reference to * h_cg->css. So we should release the reference held * via hugetlb_cgroup_charge_cgroup_rsvd() when we are * done. */ hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } return true; out_put_pages: /* put back original number of pages, chg */ (void)hugepage_subpool_put_pages(spool, chg); out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); set_vma_resv_map(vma, NULL); } return false; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; /* * Since this routine can be called in the evict inode path for all * hugetlbfs inodes, resv_map could be NULL. */ if (resv_map) { chg = region_del(resv_map, start, end); /* * region_del() can fail in the rare case where a region * must be split and another region descriptor can not be * allocated. If end == LONG_MAX, it will not fail. */ if (chg < 0) return chg; } spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. * * Note that !resv_map implies freed == 0. So (chg - freed) * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); return 0; } #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) { unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + svma->vm_start; unsigned long sbase = saddr & PUD_MASK; unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the * page table page. * * Also, vma_lock (vm_private_data) is required for sharing. 
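* * For example (illustrative): two MAP_SHARED mappings of the same * hugetlbfs file can share a PMD page table only if they map the * same file offset at the same PUD_SIZE-aligned position with * identical flags (modulo VM_LOCKED), so that every entry of the * shared PMD page is valid in both address spaces.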
*/ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || !range_in_vma(svma, sbase, s_end) || !svma->vm_private_data) return 0; return saddr; } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif /* * check on proper vm_flags and page table alignment */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible * shared pmd mappings. */ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* * vma needs to span at least one aligned PUD size, and the range * must be at least partially within it. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ if (*start > v_start) *start = ALIGN_DOWN(*start, PUD_SIZE); if (*end < v_end) *end = ALIGN(*end, PUD_SIZE); } /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the * code much cleaner. pmd allocation is essential for the shared case because * pud has to be populated inside the same i_mmap_rwsem section - otherwise * racing tasks could either miss the sharing (see huge_pte_offset) or select a * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; struct vm_area_struct *svma; unsigned long saddr; pte_t *spte = NULL; pte_t *pte; i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } } if (!spte) goto out; spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); i_mmap_unlock_read(mapping); return pte; } /* * unmap huge page backed by shared pte. * * Called with page table lock held.
* * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user */ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { return NULL; } int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pte_t *pte = NULL; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (pud) { if (sz == PUD_SIZE) { pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } } if (pte) { pte_t pteval = ptep_get_lockless(pte); BUG_ON(pte_present(pteval) && !pte_huge(pteval)); } return pte; } /* * huge_pte_offset() - Walk the page table to resolve the hugepage * entry at address @addr * * Return: Pointer to page table entry (PUD or PMD) for * address @addr, or NULL if a !p*d_present() entry is encountered and the * size @sz doesn't match the hugepage size at this level of the page * table. */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, addr); if (sz == PUD_SIZE) /* must be pud huge, non-present or none */ return (pte_t *)pud; if (!pud_present(*pud)) return NULL; /* must have a valid entry and size to go further */ pmd = pmd_offset(pud, addr); /* must be pmd huge, non-present or none */ return (pte_t *)pmd; } /* * Return a mask that can be used to update an address to the last huge * page in a page table page mapping size. Used to skip non-present * page table entries when linearly scanning address ranges. Architectures * with unique huge page to page table relationships can define their own * version of this routine. */ unsigned long hugetlb_mask_last_page(struct hstate *h) { unsigned long hp_size = huge_page_size(h); if (hp_size == PUD_SIZE) return P4D_SIZE - PUD_SIZE; else if (hp_size == PMD_SIZE) return PUD_SIZE - PMD_SIZE; else return 0UL; } #else /* See description above. Architectures can provide their own version. 
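* (The definition below is __weak precisely so that such an * architecture can override it with one matching its own huge page * to page table relationship.)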
*/ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif return 0UL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /** * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio * @folio: the folio to isolate * @list: the list to add the folio to on success * * Isolate an allocated (refcount > 0) hugetlb folio, marking it as * isolated/non-migratable, and moving it from the active list to the * given list. * * Isolation will fail if @folio is not an allocated hugetlb folio, or if * it is already isolated/non-migratable. * * On success, an additional folio reference is taken that must be dropped * using folio_putback_hugetlb() to undo the isolation. * * Return: True if isolation worked, otherwise False. */ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(folio) || !folio_test_hugetlb_migratable(folio) || !folio_try_get(folio)) { ret = false; goto unlock; } folio_clear_hugetlb_migratable(folio); list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; } int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); if (folio_test_hugetlb(folio)) { *hugetlb = true; if (folio_test_hugetlb_freed(folio)) ret = 0; else if (folio_test_hugetlb_migratable(folio) || unpoison) ret = folio_try_get(folio); else ret = -EBUSY; } spin_unlock_irq(&hugetlb_lock); return ret; } int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } /** * folio_putback_hugetlb - unisolate a hugetlb folio * @folio: the isolated hugetlb folio * * Putback/un-isolate the hugetlb folio that was previously isolated using * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it * back onto the active list. * * Will drop the additional folio reference obtained through * folio_isolate_hugetlb(). */ void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); set_page_owner_migrate_reason(&new_folio->page, reason); /* * Transfer the temporary state of the new hugetlb folio. This is * the reverse of other transitions because the new folio is going to * be final while the old one will be freed, so the old folio takes * over the temporary status. * * Also note that we have to transfer the per-node surplus state * here as well, otherwise the global surplus count will not match * the per-node's. */ if (folio_test_hugetlb_temporary(new_folio)) { int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); /* * There is no need to transfer the per-node surplus state * when we do not cross the node.
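* (E.g. a temporary folio migrated within node 0 needs no per-node * adjustment; only the cross-node case below moves one surplus page * from old_nid to new_nid.)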
*/ if (new_nid == old_nid) return; spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } spin_unlock_irq(&hugetlb_lock); } /* * Our old folio is isolated and has "migratable" cleared until it * is putback. As migration succeeded, set the new folio "migratable" * and add it to the active list. */ spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(new_folio); list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; unsigned long address; spinlock_t *ptl; pte_t *ptep; if (!(vma->vm_flags & VM_MAYSHARE)) return; if (start >= end) return; flush_cache_range(vma, start, end); /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); huge_pmd_unshare(mm, vma, address, ptep); spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); } /* * This function will unconditionally remove all the shared pmd pgtable entries * within the specific vma for a hugetlbfs memory range. */ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ALIGN_DOWN(vma->vm_end, PUD_SIZE)); }
// SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/macsec.c - MACsec device * * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/module.h> #include <crypto/aead.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> #include <net/genetlink.h> #include <net/sock.h> #include <net/gro_cells.h> #include <net/macsec.h> #include <net/dst_metadata.h> #include <net/netdev_lock.h> #include <linux/phy.h> #include <linux/byteorder/generic.h> #include <linux/if_arp.h> #include <uapi/linux/if_macsec.h> /* SecTAG length = macsec_eth_header without the optional SCI */ #define MACSEC_TAG_LEN 6 struct macsec_eth_header { struct ethhdr eth; /* SecTAG */ u8 tci_an; #if defined(__LITTLE_ENDIAN_BITFIELD) u8 short_length:6, unused:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 unused:2, short_length:6; #else #error "Please fix <asm/byteorder.h>" #endif __be32 packet_number; u8 secure_channel_id[8]; /* optional */ } __packed; /* minimum secure data length deemed "not short", see IEEE 802.1AE-2006 9.7 */ #define MIN_NON_SHORT_LEN 48 #define GCM_AES_IV_LEN 12 #define for_each_rxsc(secy, sc) \ for (sc = rcu_dereference_bh(secy->rx_sc); \ sc; \ sc = rcu_dereference_bh(sc->next)) #define for_each_rxsc_rtnl(secy, sc) \ for (sc = rtnl_dereference(secy->rx_sc); \ sc; \ sc = rtnl_dereference(sc->next)) #define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) struct gcm_iv_xpn { union { u8 short_secure_channel_id[4]; ssci_t ssci; }; __be64 pn; } __packed; struct gcm_iv { union { u8 secure_channel_id[8]; sci_t sci; }; __be32 pn; }; #define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT struct pcpu_secy_stats { struct macsec_dev_stats stats; struct u64_stats_sync syncp; }; /** * struct macsec_dev - private data * @secy: SecY config * @real_dev: pointer to underlying netdevice * @dev_tracker: refcount tracker for @real_dev reference * @stats: MACsec device stats * @secys: linked list of SecY's on the underlying device * @gro_cells: pointer to the Generic Receive Offload cell * @offload: status of offloading on the MACsec device * @insert_tx_tag: when offloading, the device requires an additional * tag to be inserted */ struct macsec_dev { struct macsec_secy secy; struct net_device *real_dev; netdevice_tracker dev_tracker; struct pcpu_secy_stats __percpu *stats; struct list_head secys; struct gro_cells gro_cells; enum macsec_offload offload; bool insert_tx_tag; }; /** * struct macsec_rxh_data - rx_handler private argument * @secys: linked list of SecY's on this underlying device */ struct macsec_rxh_data { struct list_head secys; }; static struct macsec_dev *macsec_priv(const struct net_device *dev) { return (struct macsec_dev *)netdev_priv(dev); } static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) { return rcu_dereference_bh(dev->rx_handler_data); } static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } struct macsec_cb { struct aead_request *req; union { struct macsec_tx_sa *tx_sa; struct macsec_rx_sa
*rx_sa; }; u8 assoc_num; bool valid; bool has_sci; }; static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) { struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); if (!sa || !sa->active) return NULL; if (!refcount_inc_not_zero(&sa->refcnt)) return NULL; return sa; } static void free_rx_sc_rcu(struct rcu_head *head) { struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); free_percpu(rx_sc->stats); kfree(rx_sc); } static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) { return refcount_inc_not_zero(&sc->refcnt) ? sc : NULL; } static void macsec_rxsc_put(struct macsec_rx_sc *sc) { if (refcount_dec_and_test(&sc->refcnt)) call_rcu(&sc->rcu_head, free_rx_sc_rcu); } static void free_rxsa(struct rcu_head *head) { struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); kfree(sa); } static void macsec_rxsa_put(struct macsec_rx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) call_rcu(&sa->rcu, free_rxsa); } static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) { struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); if (!sa || !sa->active) return NULL; if (!refcount_inc_not_zero(&sa->refcnt)) return NULL; return sa; } static void free_txsa(struct rcu_head *head) { struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); kfree(sa); } static void macsec_txsa_put(struct macsec_tx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) call_rcu(&sa->rcu, free_txsa); } static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); return (struct macsec_cb *)skb->cb; } #define MACSEC_PORT_SCB (0x0000) #define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) #define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) #define MACSEC_GCM_AES_128_SAK_LEN 16 #define MACSEC_GCM_AES_256_SAK_LEN 32 #define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN #define DEFAULT_XPN false #define DEFAULT_SEND_SCI true #define DEFAULT_ENCRYPT false #define DEFAULT_ENCODING_SA 0 #define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) static sci_t make_sci(const u8 *addr, __be16 port) { sci_t sci; memcpy(&sci, addr, ETH_ALEN); memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); return sci; } static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) { sci_t sci; if (sci_present) memcpy(&sci, hdr->secure_channel_id, sizeof(hdr->secure_channel_id)); else sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); return sci; } static unsigned int macsec_sectag_len(bool sci_present) { return MACSEC_TAG_LEN + (sci_present ? 
MACSEC_SCI_LEN : 0); } static unsigned int macsec_hdr_len(bool sci_present) { return macsec_sectag_len(sci_present) + ETH_HLEN; } static unsigned int macsec_extra_len(bool sci_present) { return macsec_sectag_len(sci_present) + sizeof(__be16); } /* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ static void macsec_fill_sectag(struct macsec_eth_header *h, const struct macsec_secy *secy, u32 pn, bool sci_present) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); h->eth.h_proto = htons(ETH_P_MACSEC); if (sci_present) { h->tci_an |= MACSEC_TCI_SC; memcpy(&h->secure_channel_id, &secy->sci, sizeof(h->secure_channel_id)); } else { if (tx_sc->end_station) h->tci_an |= MACSEC_TCI_ES; if (tx_sc->scb) h->tci_an |= MACSEC_TCI_SCB; } h->packet_number = htonl(pn); /* with GCM, C/E clear for !encrypt, both set for encrypt */ if (tx_sc->encrypt) h->tci_an |= MACSEC_TCI_CONFID; else if (secy->icv_len != MACSEC_DEFAULT_ICV_LEN) h->tci_an |= MACSEC_TCI_C; h->tci_an |= tx_sc->encoding_sa; } static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) { if (data_len < MIN_NON_SHORT_LEN) h->short_length = data_len; } /* Checks if a MACsec interface is being offloaded to a hardware engine */ static bool macsec_is_offloaded(struct macsec_dev *macsec) { if (macsec->offload == MACSEC_OFFLOAD_MAC || macsec->offload == MACSEC_OFFLOAD_PHY) return true; return false; } /* Checks if underlying layers implement MACsec offloading functions. */ static bool macsec_check_offload(enum macsec_offload offload, struct macsec_dev *macsec) { if (!macsec || !macsec->real_dev) return false; if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev && macsec->real_dev->phydev->macsec_ops; else if (offload == MACSEC_OFFLOAD_MAC) return macsec->real_dev->features & NETIF_F_HW_MACSEC && macsec->real_dev->macsec_ops; return false; } static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, struct macsec_dev *macsec, struct macsec_context *ctx) { if (ctx) { memset(ctx, 0, sizeof(*ctx)); ctx->offload = offload; if (offload == MACSEC_OFFLOAD_PHY) ctx->phydev = macsec->real_dev->phydev; else if (offload == MACSEC_OFFLOAD_MAC) ctx->netdev = macsec->real_dev; } if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev->macsec_ops; else return macsec->real_dev->macsec_ops; }
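/*
 * Illustrative sketch of how the netlink handlers below drive an
 * offloaded operation: fetch the ops together with a pre-filled
 * context, then dispatch a single mdo_* callback via macsec_offload()
 * (names as used later in this file):
 *
 *	const struct macsec_ops *ops;
 *	struct macsec_context ctx;
 *
 *	ops = macsec_get_ops(netdev_priv(dev), &ctx);
 *	if (!ops)
 *		return -EOPNOTSUPP;
 *	ctx.rx_sc = rx_sc;
 *	ctx.secy = secy;
 *	ret = macsec_offload(ops->mdo_add_rxsc, &ctx);
 */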
/* Returns a pointer to the MACsec ops struct if any and updates the MACsec * context device reference if provided. */ static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, struct macsec_context *ctx) { if (!macsec_check_offload(macsec->offload, macsec)) return NULL; return __macsec_get_ops(macsec->offload, macsec, ctx); } /* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) { struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; int len = skb->len - 2 * ETH_ALEN; int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; /* a) It comprises at least 17 octets */ if (skb->len <= 16) return false; /* b) MACsec EtherType: already checked */ /* c) V bit is clear */ if (h->tci_an & MACSEC_TCI_VERSION) return false; /* d) ES or SCB => !SC */ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && (h->tci_an & MACSEC_TCI_SC)) return false; /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ if (h->unused) return false; /* rx.pn != 0 if not XPN (figure 10-5 with 802.1AEbw-2013 amendment) */ if (!h->packet_number && !xpn) return false; /* length check, f) g) h) i) */ if (h->short_length) return len == extra_len + h->short_length; return len >= extra_len + MIN_NON_SHORT_LEN; } #define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) #define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, salt_t salt) { struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; gcm_iv->ssci = ssci ^ salt.ssci; gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; } static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) { struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; gcm_iv->sci = sci; gcm_iv->pn = htonl(pn); } static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) { return (struct macsec_eth_header *)skb_mac_header(skb); } static void __macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) { pr_debug("PN wrapped, transitioning to !oper\n"); tx_sa->active = false; if (secy->protect_frames) secy->operational = false; } void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) { spin_lock_bh(&tx_sa->lock); __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); } EXPORT_SYMBOL_GPL(macsec_pn_wrapped); static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, struct macsec_secy *secy) { pn_t pn; spin_lock_bh(&tx_sa->lock); pn = tx_sa->next_pn_halves; if (secy->xpn) tx_sa->next_pn++; else tx_sa->next_pn_halves.lower++; if (tx_sa->next_pn == 0) __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); return pn; } static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = netdev_priv(dev); skb->dev = macsec->real_dev; skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; } static unsigned int macsec_msdu_len(struct sk_buff *skb) { struct macsec_dev *macsec = macsec_priv(skb->dev); struct macsec_secy *secy = &macsec->secy; bool sci_present = macsec_skb_cb(skb)->has_sci; return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; } static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, struct macsec_tx_sa *tx_sa) { unsigned int msdu_len = macsec_msdu_len(skb); struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); u64_stats_update_begin(&txsc_stats->syncp); if (tx_sc->encrypt) { txsc_stats->stats.OutOctetsEncrypted += msdu_len; txsc_stats->stats.OutPktsEncrypted++; this_cpu_inc(tx_sa->stats->OutPktsEncrypted); } else { txsc_stats->stats.OutOctetsProtected += msdu_len;
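/* Note: msdu_len above counts user data only -- macsec_msdu_len()
 * excludes the MAC header, the SecTAG and the trailing ICV -- which
 * matches the OutOctets{Protected,Encrypted} octet-count definitions
 * in IEEE 802.1AE.
 */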
txsc_stats->stats.OutPktsProtected++; this_cpu_inc(tx_sa->stats->OutPktsProtected); } u64_stats_update_end(&txsc_stats->syncp); } static void count_tx(struct net_device *dev, int ret, int len) { if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_sw_netstats_tx_add(dev, 1, len); } static void macsec_encrypt_done(void *data, int err) { struct sk_buff *skb = data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; int len, ret; aead_request_free(macsec_skb_cb(skb)->req); rcu_read_lock_bh(); macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); /* packet is encrypted/protected so tx_bytes must be calculated */ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; macsec_encrypt_finish(skb, dev); ret = dev_queue_xmit(skb); count_tx(dev, ret, len); rcu_read_unlock_bh(); macsec_txsa_put(sa); dev_put(dev); } static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, unsigned char **iv, struct scatterlist **sg, int num_frags) { size_t size, iv_offset, sg_offset; struct aead_request *req; void *tmp; size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); iv_offset = size; size += GCM_AES_IV_LEN; size = ALIGN(size, __alignof__(struct scatterlist)); sg_offset = size; size += sizeof(struct scatterlist) * num_frags; tmp = kmalloc(size, GFP_ATOMIC); if (!tmp) return NULL; *iv = (unsigned char *)(tmp + iv_offset); *sg = (struct scatterlist *)(tmp + sg_offset); req = tmp; aead_request_set_tfm(req, tfm); return req; } static struct sk_buff *macsec_encrypt(struct sk_buff *skb, struct net_device *dev) { int ret; struct scatterlist *sg; struct sk_buff *trailer; unsigned char *iv; struct ethhdr *eth; struct macsec_eth_header *hh; size_t unprotected_len; struct aead_request *req; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; struct macsec_dev *macsec = macsec_priv(dev); bool sci_present; pn_t pn; secy = &macsec->secy; tx_sc = &secy->tx_sc; /* 10.5.1 TX SA assignment */ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); if (!tx_sa) { secy->operational = false; kfree_skb(skb); return ERR_PTR(-EINVAL); } if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { struct sk_buff *nskb = skb_copy_expand(skb, MACSEC_NEEDED_HEADROOM, MACSEC_NEEDED_TAILROOM, GFP_ATOMIC); if (likely(nskb)) { consume_skb(skb); skb = nskb; } else { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOMEM); } } else { skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { macsec_txsa_put(tx_sa); return ERR_PTR(-ENOMEM); } } unprotected_len = skb->len; eth = eth_hdr(skb); sci_present = macsec_send_sci(secy); hh = skb_push(skb, macsec_extra_len(sci_present)); memmove(hh, eth, 2 * ETH_ALEN); pn = tx_sa_update_pn(tx_sa, secy); if (pn.full64 == 0) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOLINK); } macsec_fill_sectag(hh, secy, pn.lower, sci_present); macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); skb_put(skb, secy->icv_len); if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.OutPktsTooLong++; u64_stats_update_end(&secy_stats->syncp); macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-EINVAL); } ret = skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(ret); } req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); if 
(!req) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOMEM); } if (secy->xpn) macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); else macsec_fill_iv(iv, secy->sci, pn.lower); sg_init_table(sg, ret); ret = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(ret < 0)) { aead_request_free(req); macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(ret); } if (tx_sc->encrypt) { int len = skb->len - macsec_hdr_len(sci_present) - secy->icv_len; aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(sci_present)); } else { aead_request_set_crypt(req, sg, sg, 0, iv); aead_request_set_ad(req, skb->len - secy->icv_len); } macsec_skb_cb(skb)->req = req; macsec_skb_cb(skb)->tx_sa = tx_sa; macsec_skb_cb(skb)->has_sci = sci_present; aead_request_set_callback(req, 0, macsec_encrypt_done, skb); dev_hold(skb->dev); ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) { return ERR_PTR(ret); } else if (ret != 0) { dev_put(skb->dev); kfree_skb(skb); aead_request_free(req); macsec_txsa_put(tx_sa); return ERR_PTR(-EINVAL); } dev_put(skb->dev); aead_request_free(req); macsec_txsa_put(tx_sa); return skb; } static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) { struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); struct macsec_eth_header *hdr = macsec_ethhdr(skb); u32 lowest_pn = 0; spin_lock(&rx_sa->lock); if (rx_sa->next_pn_halves.lower >= secy->replay_window) lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; /* Now perform replay protection check again * (see IEEE 802.1AE-2006 figure 10-5) */ if (secy->replay_protect && pn < lowest_pn && (!secy->xpn || pn_same_half(pn, lowest_pn))) { spin_unlock(&rx_sa->lock); u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_dropped); return false; } if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { unsigned int msdu_len = macsec_msdu_len(skb); u64_stats_update_begin(&rxsc_stats->syncp); if (hdr->tci_an & MACSEC_TCI_E) rxsc_stats->stats.InOctetsDecrypted += msdu_len; else rxsc_stats->stats.InOctetsValidated += msdu_len; u64_stats_update_end(&rxsc_stats->syncp); } if (!macsec_skb_cb(skb)->valid) { spin_unlock(&rx_sa->lock); /* 10.6.5 */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotValid++; u64_stats_update_end(&rxsc_stats->syncp); this_cpu_inc(rx_sa->stats->InPktsNotValid); DEV_STATS_INC(secy->netdev, rx_errors); return false; } u64_stats_update_begin(&rxsc_stats->syncp); if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { rxsc_stats->stats.InPktsInvalid++; this_cpu_inc(rx_sa->stats->InPktsInvalid); } else if (pn < lowest_pn) { rxsc_stats->stats.InPktsDelayed++; } else { rxsc_stats->stats.InPktsUnchecked++; } u64_stats_update_end(&rxsc_stats->syncp); } else { u64_stats_update_begin(&rxsc_stats->syncp); if (pn < lowest_pn) { rxsc_stats->stats.InPktsDelayed++; } else { rxsc_stats->stats.InPktsOK++; this_cpu_inc(rx_sa->stats->InPktsOK); } u64_stats_update_end(&rxsc_stats->syncp); // Instead of "pn >=" - to support pn overflow in xpn if (pn + 1 > rx_sa->next_pn_halves.lower) { rx_sa->next_pn_halves.lower = pn + 1; } else if (secy->xpn && !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { rx_sa->next_pn_halves.upper++; rx_sa->next_pn_halves.lower = pn + 1; } spin_unlock(&rx_sa->lock); } 
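/*
 * Worked example of the XPN window update just above (illustrative):
 * with next_pn_halves.lower == 0xffffffff and an incoming pn == 1,
 * pn + 1 > lower is false and pn_same_half() is false (bit 31 differs),
 * so the upper half is incremented and lower becomes pn + 1; the 64-bit
 * next_pn moves forward across the 2^32 boundary instead of jumping back.
 */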
return true; } static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) { skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); } static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) { skb->ip_summed = CHECKSUM_NONE; memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); skb_pull(skb, hdr_len); pskb_trim_unique(skb, skb->len - icv_len); } static void count_rx(struct net_device *dev, int len) { dev_sw_netstats_rx_add(dev, len); } static void macsec_decrypt_done(void *data, int err) { struct sk_buff *skb = data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct macsec_rx_sc *rx_sc = rx_sa->sc; int len; u32 pn; aead_request_free(macsec_skb_cb(skb)->req); if (!err) macsec_skb_cb(skb)->valid = true; rcu_read_lock_bh(); pn = ntohl(macsec_ethhdr(skb)->packet_number); if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { rcu_read_unlock_bh(); kfree_skb(skb); goto out; } macsec_finalize_skb(skb, macsec->secy.icv_len, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); len = skb->len; macsec_reset_skb(skb, macsec->secy.netdev); if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) count_rx(dev, len); rcu_read_unlock_bh(); out: macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); dev_put(dev); } static struct sk_buff *macsec_decrypt(struct sk_buff *skb, struct net_device *dev, struct macsec_rx_sa *rx_sa, sci_t sci, struct macsec_secy *secy) { int ret; struct scatterlist *sg; struct sk_buff *trailer; unsigned char *iv; struct aead_request *req; struct macsec_eth_header *hdr; u32 hdr_pn; u16 icv_len = secy->icv_len; macsec_skb_cb(skb)->valid = false; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); ret = skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { kfree_skb(skb); return ERR_PTR(ret); } req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); if (!req) { kfree_skb(skb); return ERR_PTR(-ENOMEM); } hdr = (struct macsec_eth_header *)skb->data; hdr_pn = ntohl(hdr->packet_number); if (secy->xpn) { pn_t recovered_pn = rx_sa->next_pn_halves; recovered_pn.lower = hdr_pn; if (hdr_pn < rx_sa->next_pn_halves.lower && !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) recovered_pn.upper++; macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, rx_sa->key.salt); } else { macsec_fill_iv(iv, sci, hdr_pn); } sg_init_table(sg, ret); ret = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(ret < 0)) { aead_request_free(req); kfree_skb(skb); return ERR_PTR(ret); } if (hdr->tci_an & MACSEC_TCI_E) { /* confidentiality: ethernet + macsec header * authenticated, encrypted payload */ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { aead_request_free(req); return ERR_PTR(-ENOMEM); } } else { /* integrity only: all headers + data authenticated */ aead_request_set_crypt(req, sg, sg, icv_len, iv); aead_request_set_ad(req, skb->len - icv_len); } macsec_skb_cb(skb)->req = req; skb->dev = dev; aead_request_set_callback(req, 0, macsec_decrypt_done, skb); dev_hold(dev); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) { return ERR_PTR(ret); } else if (ret != 0) { /* decryption/authentication failed * 10.6 if validateFrames is disabled, 
deliver anyway */ if (ret != -EBADMSG) { kfree_skb(skb); skb = ERR_PTR(ret); } } else { macsec_skb_cb(skb)->valid = true; } dev_put(dev); aead_request_free(req); return skb; } static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc; for_each_rxsc(secy, rx_sc) { if (rx_sc->sci == sci) return rx_sc; } return NULL; } static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc; for_each_rxsc_rtnl(secy, rx_sc) { if (rx_sc->sci == sci) return rx_sc; } return NULL; } static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) { /* Deliver to the uncontrolled port by default */ enum rx_handler_result ret = RX_HANDLER_PASS; struct ethhdr *hdr = eth_hdr(skb); struct metadata_dst *md_dst; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; bool is_macsec_md_dst; rcu_read_lock(); rxd = macsec_data_rcu(skb->dev); md_dst = skb_metadata_dst(skb); is_macsec_md_dst = md_dst && md_dst->type == METADATA_MACSEC; list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct sk_buff *nskb; struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); struct net_device *ndev = macsec->secy.netdev; /* If h/w offloading is enabled, HW decodes frames and strips * the SecTAG, so we have to deduce which port to deliver to. */ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { const struct macsec_ops *ops; ops = macsec_get_ops(macsec, NULL); if (ops->rx_uses_md_dst && !is_macsec_md_dst) continue; if (is_macsec_md_dst) { struct macsec_rx_sc *rx_sc; /* All drivers that implement MACsec offload * support using skb metadata destinations must * indicate that they do so. */ DEBUG_NET_WARN_ON_ONCE(!ops->rx_uses_md_dst); rx_sc = find_rx_sc(&macsec->secy, md_dst->u.macsec_info.sci); if (!rx_sc) continue; /* device indicated macsec offload occurred */ skb->dev = ndev; skb->pkt_type = PACKET_HOST; eth_skb_pkt_type(skb, ndev); ret = RX_HANDLER_ANOTHER; goto out; } /* This datapath is insecure because it is unable to * enforce isolation of broadcast/multicast traffic and * unicast traffic with promiscuous mode on the macsec * netdev. Since the core stack has no mechanism to * check that the hardware did indeed receive MACsec * traffic, it is possible that the response handling * done by the MACsec port was to a plaintext packet. * This violates the MACsec protocol standard. 
*/ if (ether_addr_equal_64bits(hdr->h_dest, ndev->dev_addr)) { /* exact match, divert skb to this port */ skb->dev = ndev; skb->pkt_type = PACKET_HOST; ret = RX_HANDLER_ANOTHER; goto out; } else if (is_multicast_ether_addr_64bits( hdr->h_dest)) { /* multicast frame, deliver on this port too */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; nskb->dev = ndev; eth_skb_pkt_type(nskb, ndev); __netif_rx(nskb); } else if (ndev->flags & IFF_PROMISC) { skb->dev = ndev; skb->pkt_type = PACKET_HOST; ret = RX_HANDLER_ANOTHER; goto out; } continue; } /* 10.6 If the management control validateFrames is not * Strict, frames without a SecTAG are received, counted, and * delivered to the Controlled Port */ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoTag++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_dropped); continue; } /* deliver on this port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; nskb->dev = ndev; if (__netif_rx(nskb) == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); } } out: rcu_read_unlock(); return ret; } static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct macsec_eth_header *hdr; struct macsec_secy *secy = NULL; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; unsigned int len; sci_t sci; u32 hdr_pn; bool cbit; struct pcpu_rx_sc_stats *rxsc_stats; struct pcpu_secy_stats *secy_stats; bool pulled_sci; int ret; if (skb_headroom(skb) < ETH_HLEN) goto drop_direct; hdr = macsec_ethhdr(skb); if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) return handle_not_macsec(skb); skb = skb_unshare(skb, GFP_ATOMIC); *pskb = skb; if (!skb) return RX_HANDLER_CONSUMED; pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); if (!pulled_sci) { if (!pskb_may_pull(skb, macsec_extra_len(false))) goto drop_direct; } hdr = macsec_ethhdr(skb); /* Frames with a SecTAG that has the TCI E bit set but the C * bit clear are discarded, as this reserved encoding is used * to identify frames with a SecTAG that are not to be * delivered to the Controlled Port. */ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) return RX_HANDLER_PASS; /* now, pull the extra length */ if (hdr->tci_an & MACSEC_TCI_SC) { if (!pulled_sci) goto drop_direct; } /* ethernet header is part of crypto processing */ skb_push(skb, ETH_HLEN); macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); rcu_read_lock(); rxd = macsec_data_rcu(skb->dev); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); sc = sc ? 
macsec_rxsc_get(sc) : NULL; if (sc) { secy = &macsec->secy; rx_sc = sc; break; } } if (!secy) goto nosci; dev = secy->netdev; macsec = macsec_priv(dev); secy_stats = this_cpu_ptr(macsec->stats); rxsc_stats = this_cpu_ptr(rx_sc->stats); if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsBadTag++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); if (!rx_sa) { /* 10.6.1 if the SA is not in use */ /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotUsingSA++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } /* not Strict, the frame (with the SecTAG and ICV * removed) is delivered to the Controlled Port. */ u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsUnusedSA++; u64_stats_update_end(&rxsc_stats->syncp); goto deliver; } /* First, PN check to avoid decrypting obviously wrong packets */ hdr_pn = ntohl(hdr->packet_number); if (secy->replay_protect) { bool late; spin_lock(&rx_sa->lock); late = rx_sa->next_pn_halves.lower >= secy->replay_window && hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); if (secy->xpn) late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); spin_unlock(&rx_sa->lock); if (late) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_dropped); goto drop; } } macsec_skb_cb(skb)->rx_sa = rx_sa; /* Disabled && !changed text => skip validation */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames != MACSEC_VALIDATE_DISABLED) skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); if (IS_ERR(skb)) { /* the decrypt callback needs the reference */ if (PTR_ERR(skb) != -EINPROGRESS) { macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); } rcu_read_unlock(); *pskb = NULL; return RX_HANDLER_CONSUMED; } if (!macsec_post_decrypt(skb, secy, hdr_pn)) goto drop; deliver: macsec_finalize_skb(skb, secy->icv_len, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); len = skb->len; macsec_reset_skb(skb, secy->netdev); if (rx_sa) macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); skb_orphan(skb); ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) count_rx(dev, len); else DEV_STATS_INC(macsec->secy.netdev, rx_dropped); rcu_read_unlock(); *pskb = NULL; return RX_HANDLER_CONSUMED; drop: macsec_rxsa_put(rx_sa); drop_nosa: macsec_rxsc_put(rx_sc); rcu_read_unlock(); drop_direct: kfree_skb(skb); *pskb = NULL; return RX_HANDLER_CONSUMED; nosci: /* 10.6.1 if the SC is not found */ cbit = !!(hdr->tci_an & MACSEC_TCI_C); if (!cbit) macsec_finalize_skb(skb, MACSEC_DEFAULT_ICV_LEN, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct sk_buff *nskb; secy_stats = this_cpu_ptr(macsec->stats); /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ if (cbit || macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoSCI++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_errors); continue; } /* not strict, the frame (with the SecTAG and ICV * removed) is 
delivered to the Controlled Port. */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; macsec_reset_skb(nskb, macsec->secy.netdev); ret = __netif_rx(nskb); if (ret == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUnknownSCI++; u64_stats_update_end(&secy_stats->syncp); } else { DEV_STATS_INC(macsec->secy.netdev, rx_dropped); } } rcu_read_unlock(); *pskb = skb; return RX_HANDLER_PASS; } static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) { struct crypto_aead *tfm; int ret; tfm = crypto_alloc_aead("gcm(aes)", 0, 0); if (IS_ERR(tfm)) return tfm; ret = crypto_aead_setkey(tfm, key, key_len); if (ret < 0) goto fail; ret = crypto_aead_setauthsize(tfm, icv_len); if (ret < 0) goto fail; return tfm; fail: crypto_free_aead(tfm); return ERR_PTR(ret); } static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, int icv_len) { rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); if (!rx_sa->stats) return -ENOMEM; rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); if (IS_ERR(rx_sa->key.tfm)) { free_percpu(rx_sa->stats); return PTR_ERR(rx_sa->key.tfm); } rx_sa->ssci = MACSEC_UNDEF_SSCI; rx_sa->active = false; rx_sa->next_pn = 1; refcount_set(&rx_sa->refcnt, 1); spin_lock_init(&rx_sa->lock); return 0; } static void clear_rx_sa(struct macsec_rx_sa *rx_sa) { rx_sa->active = false; macsec_rxsa_put(rx_sa); } static void free_rx_sc(struct macsec_rx_sc *rx_sc) { int i; for (i = 0; i < MACSEC_NUM_AN; i++) { struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); RCU_INIT_POINTER(rx_sc->sa[i], NULL); if (sa) clear_rx_sa(sa); } macsec_rxsc_put(rx_sc); } static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc, __rcu **rx_scp; for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); rx_sc; rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { if (rx_sc->sci == sci) { if (rx_sc->active) secy->n_rx_sc--; rcu_assign_pointer(*rx_scp, rx_sc->next); return rx_sc; } } return NULL; } static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, bool active) { struct macsec_rx_sc *rx_sc; struct macsec_dev *macsec; struct net_device *real_dev = macsec_priv(dev)->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); struct macsec_secy *secy; list_for_each_entry(macsec, &rxd->secys, secys) { if (find_rx_sc_rtnl(&macsec->secy, sci)) return ERR_PTR(-EEXIST); } rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); if (!rx_sc) return ERR_PTR(-ENOMEM); rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); if (!rx_sc->stats) { kfree(rx_sc); return ERR_PTR(-ENOMEM); } rx_sc->sci = sci; rx_sc->active = active; refcount_set(&rx_sc->refcnt, 1); secy = &macsec_priv(dev)->secy; rcu_assign_pointer(rx_sc->next, secy->rx_sc); rcu_assign_pointer(secy->rx_sc, rx_sc); if (rx_sc->active) secy->n_rx_sc++; return rx_sc; } static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, int icv_len) { tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); if (!tx_sa->stats) return -ENOMEM; tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); if (IS_ERR(tx_sa->key.tfm)) { free_percpu(tx_sa->stats); return PTR_ERR(tx_sa->key.tfm); } tx_sa->ssci = MACSEC_UNDEF_SSCI; tx_sa->active = false; refcount_set(&tx_sa->refcnt, 1); spin_lock_init(&tx_sa->lock); return 0; } static void clear_tx_sa(struct macsec_tx_sa *tx_sa) { tx_sa->active = false; macsec_txsa_put(tx_sa); } static struct genl_family macsec_fam; static struct net_device *get_dev_from_nl(struct net 
*net, struct nlattr **attrs) { int ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); struct net_device *dev; dev = __dev_get_by_index(net, ifindex); if (!dev) return ERR_PTR(-ENODEV); if (!netif_is_macsec(dev)) return ERR_PTR(-ENODEV); return dev; } static enum macsec_offload nla_get_offload(const struct nlattr *nla) { return (__force enum macsec_offload)nla_get_u8(nla); } static sci_t nla_get_sci(const struct nlattr *nla) { return (__force sci_t)nla_get_u64(nla); } static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, int padattr) { return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); } static ssci_t nla_get_ssci(const struct nlattr *nla) { return (__force ssci_t)nla_get_u32(nla); } static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) { return nla_put_u32(skb, attrtype, (__force u64)value); } static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_sa, struct net_device **devp, struct macsec_secy **secyp, struct macsec_tx_sc **scp, u8 *assoc_num) { struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; if (!tb_sa[MACSEC_SA_ATTR_AN]) return ERR_PTR(-EINVAL); *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); dev = get_dev_from_nl(net, attrs); if (IS_ERR(dev)) return ERR_CAST(dev); if (*assoc_num >= MACSEC_NUM_AN) return ERR_PTR(-EINVAL); secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); if (!tx_sa) return ERR_PTR(-ENODEV); *devp = dev; *scp = tx_sc; *secyp = secy; return tx_sa; } static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_rxsc, struct net_device **devp, struct macsec_secy **secyp) { struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; sci_t sci; dev = get_dev_from_nl(net, attrs); if (IS_ERR(dev)) return ERR_CAST(dev); secy = &macsec_priv(dev)->secy; if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) return ERR_PTR(-EINVAL); sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); rx_sc = find_rx_sc_rtnl(secy, sci); if (!rx_sc) return ERR_PTR(-ENODEV); *secyp = secy; *devp = dev; return rx_sc; } static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_rxsc, struct nlattr **tb_sa, struct net_device **devp, struct macsec_secy **secyp, struct macsec_rx_sc **scp, u8 *assoc_num) { struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; if (!tb_sa[MACSEC_SA_ATTR_AN]) return ERR_PTR(-EINVAL); *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (*assoc_num >= MACSEC_NUM_AN) return ERR_PTR(-EINVAL); rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); if (IS_ERR(rx_sc)) return ERR_CAST(rx_sc); rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); if (!rx_sa) return ERR_PTR(-ENODEV); *scp = rx_sc; return rx_sa; } static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, }; static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, }; static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), [MACSEC_SA_ATTR_KEYID] = { 
.type = NLA_BINARY, .len = MACSEC_KEYID_LEN, }, [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, .len = MACSEC_MAX_KEY_LEN, }, [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, .len = MACSEC_SALT_LEN, }, }; static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, }; /* Offloads an operation to a device driver */ static int macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { int ret; if (unlikely(!func)) return 0; if (ctx->offload == MACSEC_OFFLOAD_PHY) mutex_lock(&ctx->phydev->lock); ret = (*func)(ctx); if (ctx->offload == MACSEC_OFFLOAD_PHY) mutex_unlock(&ctx->phydev->lock); return ret; } static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) { if (!attrs[MACSEC_ATTR_SA_CONFIG]) return -EINVAL; if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) return -EINVAL; return 0; } static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) { if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) return -EINVAL; if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) return -EINVAL; return 0; } static bool validate_add_rxsa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || !attrs[MACSEC_SA_ATTR_KEY] || !attrs[MACSEC_SA_ATTR_KEYID]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) return false; return true; } static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct nlattr **attrs = info->attrs; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; unsigned char assoc_num; int pn_len; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsa(tb_sa)) return -EINVAL; rtnl_lock(); rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); rtnl_unlock(); return -EINVAL; } pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (tb_sa[MACSEC_SA_ATTR_PN] && nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } if (secy->xpn) { if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { rtnl_unlock(); return -EINVAL; } if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } } rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); if (rx_sa) { rtnl_unlock(); return -EBUSY; } rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); if (!rx_sa) { rtnl_unlock(); return -ENOMEM; } err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len, secy->icv_len); if (err < 0) { kfree(rx_sa); rtnl_unlock(); return err; } if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); rx_sa->sc = rx_sc; if (secy->xpn) { rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], MACSEC_SALT_LEN); } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); err = macsec_offload(ops->mdo_add_rxsa, &ctx); memzero_explicit(ctx.sa.key, secy->key_len); if (err) goto cleanup; } nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); rtnl_unlock(); return 0; cleanup: macsec_rxsa_put(rx_sa); rtnl_unlock(); return err; } static bool validate_add_rxsc(struct nlattr **attrs) { if (!attrs[MACSEC_RXSC_ATTR_SCI]) return false; if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) return false; } return true; } static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; sci_t sci = MACSEC_UNDEF_SCI; struct nlattr **attrs = info->attrs; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct macsec_secy *secy; bool active = true; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsc(tb_rxsc)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); rx_sc = create_rx_sc(dev, sci, active); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_add_rxsc, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: del_rx_sc(secy, sci); free_rx_sc(rx_sc); rtnl_unlock(); return ret; } static 
bool validate_add_txsa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || !attrs[MACSEC_SA_ATTR_PN] || !attrs[MACSEC_SA_ATTR_KEY] || !attrs[MACSEC_SA_ATTR_KEYID]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) return false; return true; } static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct nlattr **attrs = info->attrs; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; unsigned char assoc_num; int pn_len; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_operational; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_add_txsa(tb_sa)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); rtnl_unlock(); return -EINVAL; } pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } if (secy->xpn) { if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { rtnl_unlock(); return -EINVAL; } if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } } tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); if (tx_sa) { rtnl_unlock(); return -EBUSY; } tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); if (!tx_sa) { rtnl_unlock(); return -ENOMEM; } err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len, secy->icv_len); if (err < 0) { kfree(tx_sa); rtnl_unlock(); return err; } spin_lock_bh(&tx_sa->lock); tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa && tx_sa->active) secy->operational = true; if (secy->xpn) { tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], MACSEC_SALT_LEN); } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); err = macsec_offload(ops->mdo_add_txsa, &ctx); memzero_explicit(ctx.sa.key, secy->key_len); if (err) goto cleanup; } nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); rtnl_unlock(); return 0; cleanup: secy->operational = was_operational; macsec_txsa_put(tx_sa); 
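/* Offload setup failed; the software state configured above has been
 * rolled back (operational flag restored, SA reference dropped), so
 * release the RTNL lock and report the error to the netlink caller.
 */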
rtnl_unlock(); return err; } static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; rtnl_lock(); rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, &dev, &secy, &rx_sc, &assoc_num); if (IS_ERR(rx_sa)) { rtnl_unlock(); return PTR_ERR(rx_sa); } if (rx_sa->active) { rtnl_unlock(); return -EBUSY; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_rxsa, &ctx); if (ret) goto cleanup; } RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); clear_rx_sa(rx_sa); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; sci_t sci; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), info->attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); rx_sc = del_rx_sc(secy, sci); if (!rx_sc) { rtnl_unlock(); return -ENODEV; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_rxsc, &ctx); if (ret) goto cleanup; } free_rx_sc(rx_sc); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; rtnl_lock(); tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, &dev, &secy, &tx_sc, &assoc_num); if (IS_ERR(tx_sa)) { rtnl_unlock(); return PTR_ERR(tx_sa); } if (tx_sa->active) { rtnl_unlock(); return -EBUSY; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_txsa, &ctx); if (ret) goto cleanup; } RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); clear_tx_sa(tx_sa); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return 
ret; } static bool validate_upd_sa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || attrs[MACSEC_SA_ATTR_KEY] || attrs[MACSEC_SA_ATTR_KEYID] || attrs[MACSEC_SA_ATTR_SSCI] || attrs[MACSEC_SA_ATTR_SALT]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } return true; } static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_operational, was_active; pn_t prev_pn; int ret = 0; prev_pn.full64 = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_upd_sa(tb_sa)) return -EINVAL; rtnl_lock(); tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, &dev, &secy, &tx_sc, &assoc_num); if (IS_ERR(tx_sa)) { rtnl_unlock(); return PTR_ERR(tx_sa); } if (tb_sa[MACSEC_SA_ATTR_PN]) { int pn_len; pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } spin_lock_bh(&tx_sa->lock); prev_pn = tx_sa->next_pn_halves; tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); } was_active = tx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa) secy->operational = tx_sa->active; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.sa.update_pn = !!prev_pn.full64; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_txsa, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&tx_sa->lock); tx_sa->next_pn_halves = prev_pn; spin_unlock_bh(&tx_sa->lock); } tx_sa->active = was_active; secy->operational = was_operational; rtnl_unlock(); return ret; } static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_active; pn_t prev_pn; int ret = 0; prev_pn.full64 = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_upd_sa(tb_sa)) return -EINVAL; rtnl_lock(); rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, &dev, &secy, &rx_sc, &assoc_num); if (IS_ERR(rx_sa)) { rtnl_unlock(); return PTR_ERR(rx_sa); } if (tb_sa[MACSEC_SA_ATTR_PN]) { int pn_len; pn_len = secy->xpn ? 
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } spin_lock_bh(&rx_sa->lock); prev_pn = rx_sa->next_pn_halves; rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } was_active = rx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.sa.update_pn = !!prev_pn.full64; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); rx_sa->next_pn_halves = prev_pn; spin_unlock_bh(&rx_sa->lock); } rx_sa->active = was_active; rtnl_unlock(); return ret; } static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; unsigned int prev_n_rx_sc; bool was_active; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsc(tb_rxsc)) return -EINVAL; rtnl_lock(); rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } was_active = rx_sc->active; prev_n_rx_sc = secy->n_rx_sc; if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); if (rx_sc->active != new) secy->n_rx_sc += new ? 
1 : -1; rx_sc->active = new; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: secy->n_rx_sc = prev_n_rx_sc; rx_sc->active = was_active; rtnl_unlock(); return ret; } static bool macsec_is_configured(struct macsec_dev *macsec) { struct macsec_secy *secy = &macsec->secy; struct macsec_tx_sc *tx_sc = &secy->tx_sc; int i; if (secy->rx_sc) return true; for (i = 0; i < MACSEC_NUM_AN; i++) if (tx_sc->sa[i]) return true; return false; } static bool macsec_needs_tx_tag(struct macsec_dev *macsec, const struct macsec_ops *ops) { return macsec->offload == MACSEC_OFFLOAD_PHY && ops->mdo_insert_tx_tag; } static void macsec_set_head_tail_room(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int needed_headroom, needed_tailroom; const struct macsec_ops *ops; ops = macsec_get_ops(macsec, NULL); if (ops) { needed_headroom = ops->needed_headroom; needed_tailroom = ops->needed_tailroom; } else { needed_headroom = MACSEC_NEEDED_HEADROOM; needed_tailroom = MACSEC_NEEDED_TAILROOM; } dev->needed_headroom = real_dev->needed_headroom + needed_headroom; dev->needed_tailroom = real_dev->needed_tailroom + needed_tailroom; } static void macsec_inherit_tso_max(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); /* if macsec is offloaded, we need to follow the lower * device's capabilities. otherwise, we can ignore them. */ if (macsec_is_offloaded(macsec)) netif_inherit_tso_max(dev, macsec->real_dev); } static int macsec_update_offload(struct net_device *dev, enum macsec_offload offload) { enum macsec_offload prev_offload; const struct macsec_ops *ops; struct macsec_context ctx; struct macsec_dev *macsec; int ret = 0; macsec = macsec_priv(dev); /* Check if the offloading mode is supported by the underlying layers */ if (offload != MACSEC_OFFLOAD_OFF && !macsec_check_offload(offload, macsec)) return -EOPNOTSUPP; /* Check if the net device is busy. */ if (netif_running(dev)) return -EBUSY; /* Check if the device already has rules configured: we do not support * rules migration. */ if (macsec_is_configured(macsec)) return -EBUSY; prev_offload = macsec->offload; ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? prev_offload : offload, macsec, &ctx); if (!ops) return -EOPNOTSUPP; macsec->offload = offload; ctx.secy = &macsec->secy; ret = offload == MACSEC_OFFLOAD_OFF ? 
macsec_offload(ops->mdo_del_secy, &ctx) : macsec_offload(ops->mdo_add_secy, &ctx); if (ret) { macsec->offload = prev_offload; return ret; } macsec_set_head_tail_room(dev); macsec->insert_tx_tag = macsec_needs_tx_tag(macsec, ops); macsec_inherit_tso_max(dev); netdev_update_features(dev); return ret; } static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; struct nlattr **attrs = info->attrs; enum macsec_offload offload; struct macsec_dev *macsec; struct net_device *dev; int ret = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (!attrs[MACSEC_ATTR_OFFLOAD]) return -EINVAL; if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, attrs[MACSEC_ATTR_OFFLOAD], macsec_genl_offload_policy, NULL)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { ret = PTR_ERR(dev); goto out; } macsec = macsec_priv(dev); if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) { ret = -EINVAL; goto out; } offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); if (macsec->offload != offload) ret = macsec_update_offload(dev, offload); out: rtnl_unlock(); return ret; } static void get_tx_sa_stats(struct net_device *dev, int an, struct macsec_tx_sa *tx_sa, struct macsec_tx_sa_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.sa.assoc_num = an; ctx.sa.tx_sa = tx_sa; ctx.stats.tx_sa_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct macsec_tx_sa_stats *stats = per_cpu_ptr(tx_sa->stats, cpu); sum->OutPktsProtected += stats->OutPktsProtected; sum->OutPktsEncrypted += stats->OutPktsEncrypted; } } static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) { if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, sum->OutPktsProtected) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, sum->OutPktsEncrypted)) return -EMSGSIZE; return 0; } static void get_rx_sa_stats(struct net_device *dev, struct macsec_rx_sc *rx_sc, int an, struct macsec_rx_sa *rx_sa, struct macsec_rx_sa_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.sa.assoc_num = an; ctx.sa.rx_sa = rx_sa; ctx.stats.rx_sa_stats = sum; ctx.secy = &macsec_priv(dev)->secy; ctx.rx_sc = rx_sc; macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct macsec_rx_sa_stats *stats = per_cpu_ptr(rx_sa->stats, cpu); sum->InPktsOK += stats->InPktsOK; sum->InPktsInvalid += stats->InPktsInvalid; sum->InPktsNotValid += stats->InPktsNotValid; sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; sum->InPktsUnusedSA += stats->InPktsUnusedSA; } } static int copy_rx_sa_stats(struct sk_buff *skb, struct macsec_rx_sa_stats *sum) { if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA) || 
nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA)) return -EMSGSIZE; return 0; } static void get_rx_sc_stats(struct net_device *dev, struct macsec_rx_sc *rx_sc, struct macsec_rx_sc_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.rx_sc_stats = sum; ctx.secy = &macsec_priv(dev)->secy; ctx.rx_sc = rx_sc; macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_rx_sc_stats *stats; struct macsec_rx_sc_stats tmp; unsigned int start; stats = per_cpu_ptr(rx_sc->stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry(&stats->syncp, start)); sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; sum->InPktsUnchecked += tmp.InPktsUnchecked; sum->InPktsDelayed += tmp.InPktsDelayed; sum->InPktsOK += tmp.InPktsOK; sum->InPktsInvalid += tmp.InPktsInvalid; sum->InPktsLate += tmp.InPktsLate; sum->InPktsNotValid += tmp.InPktsNotValid; sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; sum->InPktsUnusedSA += tmp.InPktsUnusedSA; } } static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, sum->InOctetsValidated, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, sum->InOctetsDecrypted, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, sum->InPktsUnchecked, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, sum->InPktsDelayed, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, sum->InPktsLate, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA, MACSEC_RXSC_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static void get_tx_sc_stats(struct net_device *dev, struct macsec_tx_sc_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.tx_sc_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_tx_sc_stats *stats; struct macsec_tx_sc_stats tmp; unsigned int start; stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry(&stats->syncp, start)); sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; sum->OutOctetsProtected += 
tmp.OutOctetsProtected; sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; } } static int copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, sum->OutPktsProtected, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, sum->OutPktsEncrypted, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, sum->OutOctetsProtected, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, sum->OutOctetsEncrypted, MACSEC_TXSC_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.dev_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_dev_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_secy_stats *stats; struct macsec_dev_stats tmp; unsigned int start; stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry(&stats->syncp, start)); sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; sum->OutPktsTooLong += tmp.OutPktsTooLong; sum->InPktsNoTag += tmp.InPktsNoTag; sum->InPktsBadTag += tmp.InPktsBadTag; sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; sum->InPktsNoSCI += tmp.InPktsNoSCI; sum->InPktsOverrun += tmp.InPktsOverrun; } } static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, sum->OutPktsUntagged, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, sum->InPktsUntagged, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, sum->OutPktsTooLong, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, sum->InPktsNoTag, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, sum->InPktsBadTag, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, sum->InPktsUnknownSCI, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, sum->InPktsNoSCI, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, sum->InPktsOverrun, MACSEC_SECY_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) { struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *secy_nest = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY); u64 csid; if (!secy_nest) return 1; switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; break; default: goto cancel; } if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, MACSEC_SECY_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, csid, MACSEC_SECY_ATTR_PAD) || nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) goto cancel; if (secy->replay_protect) { if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) goto cancel; } nla_nest_end(skb, secy_nest); return 0; cancel: nla_nest_cancel(skb, secy_nest); return 1; } static noinline_for_stack int dump_secy(struct macsec_secy *secy, struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct macsec_tx_sc_stats tx_sc_stats = {0, }; struct macsec_tx_sa_stats tx_sa_stats = {0, }; struct macsec_rx_sc_stats rx_sc_stats = {0, }; struct macsec_rx_sa_stats rx_sa_stats = {0, }; struct macsec_dev *macsec = netdev_priv(dev); struct macsec_dev_stats dev_stats = {0, }; struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *txsa_list, *rxsc_list; struct macsec_rx_sc *rx_sc; struct nlattr *attr; void *hdr; int i, j; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); if (!attr) goto nla_put_failure; if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) goto nla_put_failure; nla_nest_end(skb, attr); if (nla_put_secy(secy, skb)) goto nla_put_failure; attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); if (!attr) goto nla_put_failure; get_tx_sc_stats(dev, &tx_sc_stats); if (copy_tx_sc_stats(skb, &tx_sc_stats)) { nla_nest_cancel(skb, attr); goto nla_put_failure; } nla_nest_end(skb, attr); attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); if (!attr) goto nla_put_failure; get_secy_stats(dev, &dev_stats); if (copy_secy_stats(skb, &dev_stats)) { nla_nest_cancel(skb, attr); goto nla_put_failure; } nla_nest_end(skb, attr); txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); if (!txsa_list) goto nla_put_failure; for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); struct nlattr *txsa_nest; u64 pn; int pn_len; if (!tx_sa) continue; txsa_nest = nla_nest_start_noflag(skb, j++); if (!txsa_nest) { nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); if (copy_tx_sa_stats(skb, &tx_sa_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } nla_nest_end(skb, attr); if (secy->xpn) { pn = tx_sa->next_pn; pn_len 
= MACSEC_XPN_PN_LEN; } else { pn = tx_sa->next_pn_halves.lower; pn_len = MACSEC_DEFAULT_PN_LEN; } if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } nla_nest_end(skb, txsa_nest); } nla_nest_end(skb, txsa_list); rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); if (!rxsc_list) goto nla_put_failure; j = 1; for_each_rxsc_rtnl(secy, rx_sc) { int k; struct nlattr *rxsa_list; struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); if (!rxsc_nest) { nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, MACSEC_RXSC_ATTR_PAD)) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); if (copy_rx_sc_stats(skb, &rx_sc_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, attr); rxsa_list = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_SA_LIST); if (!rxsa_list) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); struct nlattr *rxsa_nest; u64 pn; int pn_len; if (!rx_sa) continue; rxsa_nest = nla_nest_start_noflag(skb, k++); if (!rxsa_nest) { nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); if (copy_rx_sa_stats(skb, &rx_sa_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, attr); if (secy->xpn) { pn = rx_sa->next_pn; pn_len = MACSEC_XPN_PN_LEN; } else { pn = rx_sa->next_pn_halves.lower; pn_len = MACSEC_DEFAULT_PN_LEN; } if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { nla_nest_cancel(skb, rxsa_nest); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, rxsa_nest); } nla_nest_end(skb, rxsa_list); nla_nest_end(skb, rxsc_nest); } nla_nest_end(skb, rxsc_list); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int macsec_generation = 1; /* protected by RTNL */ static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *dev; int dev_idx, d; 
dev_idx = cb->args[0]; d = 0; rtnl_lock(); cb->seq = macsec_generation; for_each_netdev(net, dev) { struct macsec_secy *secy; if (d < dev_idx) goto next; if (!netif_is_macsec(dev)) goto next; secy = &macsec_priv(dev)->secy; if (dump_secy(secy, dev, skb, cb) < 0) goto done; next: d++; } done: rtnl_unlock(); cb->args[0] = d; return skb->len; } static const struct genl_small_ops macsec_genl_ops[] = { { .cmd = MACSEC_CMD_GET_TXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = macsec_dump_txsc, }, { .cmd = MACSEC_CMD_ADD_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_ADD_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_ADD_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_OFFLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_offload, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family macsec_fam __ro_after_init = { .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, .maxattr = MACSEC_ATTR_MAX, .policy = macsec_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = macsec_genl_ops, .n_small_ops = ARRAY_SIZE(macsec_genl_ops), .resv_start_op = MACSEC_CMD_UPD_OFFLOAD + 1, }; static struct sk_buff *macsec_insert_tx_tag(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); const struct macsec_ops *ops; struct phy_device *phydev; struct macsec_context ctx; int skb_final_len; int err; ops = macsec_get_ops(macsec, &ctx); skb_final_len = skb->len - ETH_HLEN + ops->needed_headroom + ops->needed_tailroom; if (unlikely(skb_final_len > macsec->real_dev->mtu)) { err = -EINVAL; goto cleanup; } phydev = macsec->real_dev->phydev; err = skb_ensure_writable_head_tail(skb, dev); if (unlikely(err < 0)) goto cleanup; err = ops->mdo_insert_tx_tag(phydev, skb); if (unlikely(err)) goto cleanup; return skb; cleanup: kfree_skb(skb); return ERR_PTR(err); } static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = netdev_priv(dev); struct macsec_secy *secy = &macsec->secy; struct pcpu_secy_stats *secy_stats; int ret, len; if (macsec_is_offloaded(netdev_priv(dev))) { struct metadata_dst *md_dst = secy->tx_sc.md_dst; skb_dst_drop(skb); dst_hold(&md_dst->dst); skb_dst_set(skb, &md_dst->dst); if (macsec->insert_tx_tag) { skb = macsec_insert_tx_tag(skb, dev); if 
(IS_ERR(skb)) { DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } } skb->dev = macsec->real_dev; return dev_queue_xmit(skb); } /* 10.5 */ if (!secy->protect_frames) { secy_stats = this_cpu_ptr(macsec->stats); u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.OutPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); skb->dev = macsec->real_dev; len = skb->len; ret = dev_queue_xmit(skb); count_tx(dev, ret, len); return ret; } if (!secy->operational) { kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } len = skb->len; skb = macsec_encrypt(skb, dev); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EINPROGRESS) DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); macsec_encrypt_finish(skb, dev); ret = dev_queue_xmit(skb); count_tx(dev, ret, len); return ret; } #define MACSEC_FEATURES \ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) #define MACSEC_OFFLOAD_FEATURES \ (MACSEC_FEATURES | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES | \ NETIF_F_LRO | NETIF_F_RXHASH | NETIF_F_CSUM_MASK | NETIF_F_RXCSUM) static int macsec_dev_init(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int err; err = gro_cells_init(&macsec->gro_cells, dev); if (err) return err; macsec_inherit_tso_max(dev); dev->hw_features = real_dev->hw_features & MACSEC_OFFLOAD_FEATURES; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->features = real_dev->features & MACSEC_OFFLOAD_FEATURES; dev->features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; macsec_set_head_tail_room(dev); if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, real_dev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); /* Get macsec's reference to real_dev */ netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL); return 0; } static void macsec_dev_uninit(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); gro_cells_destroy(&macsec->gro_cells); } static netdev_features_t macsec_fix_features(struct net_device *dev, netdev_features_t features) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; netdev_features_t mask; mask = macsec_is_offloaded(macsec) ? 
MACSEC_OFFLOAD_FEATURES : MACSEC_FEATURES; features &= (real_dev->features & mask) | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; return features; } static int macsec_dev_open(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int err; err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) return err; if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(real_dev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(real_dev, 1); if (err < 0) goto clear_allmulti; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto clear_allmulti; } ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_dev_open, &ctx); if (err) goto clear_allmulti; } if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); return 0; clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); del_unicast: dev_uc_del(real_dev, dev->dev_addr); netif_carrier_off(dev); return err; } static int macsec_dev_stop(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; netif_carrier_off(dev); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_dev_stop, &ctx); } } dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(real_dev, -1); dev_uc_del(real_dev, dev->dev_addr); return 0; } static void macsec_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = macsec_priv(dev)->real_dev; if (!(dev->flags & IFF_UP)) return; if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 
1 : -1); } static void macsec_dev_set_rx_mode(struct net_device *dev) { struct net_device *real_dev = macsec_priv(dev)->real_dev; dev_mc_sync(real_dev, dev); dev_uc_sync(real_dev, dev); } static int macsec_set_mac_address(struct net_device *dev, void *p) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (dev->flags & IFF_UP) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; } ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (!ops) { err = -EOPNOTSUPP; goto restore_old_addr; } ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_upd_secy, &ctx); if (err) goto restore_old_addr; } if (dev->flags & IFF_UP) dev_uc_del(real_dev, old_addr); return 0; restore_old_addr: if (dev->flags & IFF_UP) dev_uc_del(real_dev, addr->sa_data); eth_hw_addr_set(dev, old_addr); return err; } static int macsec_change_mtu(struct net_device *dev, int new_mtu) { struct macsec_dev *macsec = macsec_priv(dev); unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); if (macsec->real_dev->mtu - extra < new_mtu) return -ERANGE; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static void macsec_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { if (!dev->tstats) return; dev_fetch_sw_netstats(s, dev->tstats); s->rx_dropped = DEV_STATS_READ(dev, rx_dropped); s->tx_dropped = DEV_STATS_READ(dev, tx_dropped); s->rx_errors = DEV_STATS_READ(dev, rx_errors); } static int macsec_get_iflink(const struct net_device *dev) { return READ_ONCE(macsec_priv(dev)->real_dev->ifindex); } static const struct net_device_ops macsec_netdev_ops = { .ndo_init = macsec_dev_init, .ndo_uninit = macsec_dev_uninit, .ndo_open = macsec_dev_open, .ndo_stop = macsec_dev_stop, .ndo_fix_features = macsec_fix_features, .ndo_change_mtu = macsec_change_mtu, .ndo_set_rx_mode = macsec_dev_set_rx_mode, .ndo_change_rx_flags = macsec_dev_change_rx_flags, .ndo_set_mac_address = macsec_set_mac_address, .ndo_start_xmit = macsec_start_xmit, .ndo_get_stats64 = macsec_get_stats64, .ndo_get_iflink = macsec_get_iflink, }; static const struct device_type macsec_type = { .name = "macsec", }; static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, [IFLA_MACSEC_ES] = { .type = NLA_U8 }, [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, [IFLA_MACSEC_OFFLOAD] = { .type = NLA_U8 }, }; static void macsec_free_netdev(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); dst_release(&macsec->secy.tx_sc.md_dst->dst); free_percpu(macsec->stats); free_percpu(macsec->secy.tx_sc.stats); /* Get rid of the macsec's reference to real_dev */ netdev_put(macsec->real_dev, &macsec->dev_tracker); } static void macsec_setup(struct net_device *dev) { 
ether_setup(dev); dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &macsec_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macsec_free_netdev; SET_NETDEV_DEVTYPE(dev, &macsec_type); eth_zero_addr(dev->broadcast); } static int macsec_changelink_common(struct net_device *dev, struct nlattr *data[]) { struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; if (data[IFLA_MACSEC_ENCODING_SA]) { struct macsec_tx_sa *tx_sa; tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); secy->operational = tx_sa && tx_sa->active; } if (data[IFLA_MACSEC_ENCRYPT]) tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); if (data[IFLA_MACSEC_PROTECT]) secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); if (data[IFLA_MACSEC_INC_SCI]) tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); if (data[IFLA_MACSEC_ES]) tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); if (data[IFLA_MACSEC_SCB]) tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); if (data[IFLA_MACSEC_REPLAY_PROTECT]) secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); if (data[IFLA_MACSEC_VALIDATION]) secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); if (data[IFLA_MACSEC_CIPHER_SUITE]) { switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_DEFAULT_CIPHER_ID: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; secy->xpn = false; break; case MACSEC_CIPHER_ID_GCM_AES_256: secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; secy->xpn = false; break; case MACSEC_CIPHER_ID_GCM_AES_XPN_128: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; secy->xpn = true; break; case MACSEC_CIPHER_ID_GCM_AES_XPN_256: secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; secy->xpn = true; break; default: return -EINVAL; } } if (data[IFLA_MACSEC_WINDOW]) { secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window * for XPN cipher suites */ if (secy->xpn && secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) return -EINVAL; } return 0; } static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macsec_dev *macsec = macsec_priv(dev); bool macsec_offload_state_change = false; enum macsec_offload offload; struct macsec_tx_sc tx_sc; struct macsec_secy secy; int ret; if (!data) return 0; if (data[IFLA_MACSEC_CIPHER_SUITE] || data[IFLA_MACSEC_ICV_LEN] || data[IFLA_MACSEC_SCI] || data[IFLA_MACSEC_PORT]) return -EINVAL; /* Keep a copy of unmodified secy and tx_sc, in case the offload * propagation fails, to revert macsec_changelink_common. 
*/ memcpy(&secy, &macsec->secy, sizeof(secy)); memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ret = macsec_changelink_common(dev, data); if (ret) goto cleanup; if (data[IFLA_MACSEC_OFFLOAD]) { offload = nla_get_u8(data[IFLA_MACSEC_OFFLOAD]); if (macsec->offload != offload) { macsec_offload_state_change = true; ret = macsec_update_offload(dev, offload); if (ret) goto cleanup; } } /* If h/w offloading is available, propagate to the device */ if (!macsec_offload_state_change && macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.secy = &macsec->secy; ret = macsec_offload(ops->mdo_upd_secy, &ctx); if (ret) goto cleanup; } return 0; cleanup: memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); memcpy(&macsec->secy, &secy, sizeof(secy)); return ret; } static void macsec_del_dev(struct macsec_dev *macsec) { int i; while (macsec->secy.rx_sc) { struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); free_rx_sc(rx_sc); } for (i = 0; i < MACSEC_NUM_AN; i++) { struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); if (sa) { RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); clear_tx_sa(sa); } } } static void macsec_common_dellink(struct net_device *dev, struct list_head *head) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_del_secy, &ctx); } } unregister_netdevice_queue(dev, head); list_del_rcu(&macsec->secys); macsec_del_dev(macsec); netdev_upper_dev_unlink(real_dev, dev); macsec_generation++; } static void macsec_dellink(struct net_device *dev, struct list_head *head) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); macsec_common_dellink(dev, head); if (list_empty(&rxd->secys)) { netdev_rx_handler_unregister(real_dev); kfree(rxd); } } static int register_macsec_dev(struct net_device *real_dev, struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); if (!rxd) { int err; rxd = kmalloc(sizeof(*rxd), GFP_KERNEL); if (!rxd) return -ENOMEM; INIT_LIST_HEAD(&rxd->secys); err = netdev_rx_handler_register(real_dev, macsec_handle_frame, rxd); if (err < 0) { kfree(rxd); return err; } } list_add_tail_rcu(&macsec->secys, &rxd->secys); return 0; } static bool sci_exists(struct net_device *dev, sci_t sci) { struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); struct macsec_dev *macsec; list_for_each_entry(macsec, &rxd->secys, secys) { if (macsec->secy.sci == sci) return true; } return false; } static sci_t dev_to_sci(struct net_device *dev, __be16 port) { return make_sci(dev->dev_addr, port); } static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_secy *secy = &macsec->secy; macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); if (!macsec->stats) return -ENOMEM; secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); if (!secy->tx_sc.stats) return -ENOMEM; secy->tx_sc.md_dst = metadata_dst_alloc(0, METADATA_MACSEC, 
GFP_KERNEL); if (!secy->tx_sc.md_dst) /* macsec and secy percpu stats will be freed when unregistering * net_device in macsec_free_netdev() */ return -ENOMEM; if (sci == MACSEC_UNDEF_SCI) sci = dev_to_sci(dev, MACSEC_PORT_ES); secy->netdev = dev; secy->operational = true; secy->key_len = DEFAULT_SAK_LEN; secy->icv_len = icv_len; secy->validate_frames = MACSEC_VALIDATE_DEFAULT; secy->protect_frames = true; secy->replay_protect = false; secy->xpn = DEFAULT_XPN; secy->sci = sci; secy->tx_sc.md_dst->u.macsec_info.sci = sci; secy->tx_sc.active = true; secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; secy->tx_sc.encrypt = DEFAULT_ENCRYPT; secy->tx_sc.send_sci = DEFAULT_SEND_SCI; secy->tx_sc.end_station = false; secy->tx_sc.scb = false; return 0; } static struct lock_class_key macsec_netdev_addr_lock_key; static int macsec_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct macsec_dev *macsec = macsec_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; rx_handler_func_t *rx_handler; u8 icv_len = MACSEC_DEFAULT_ICV_LEN; struct net_device *real_dev; int err, mtu; sci_t sci; if (!tb[IFLA_LINK]) return -EINVAL; real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; if (real_dev->type != ARPHRD_ETHER) return -EINVAL; dev->priv_flags |= IFF_MACSEC; macsec->real_dev = real_dev; if (data && data[IFLA_MACSEC_OFFLOAD]) macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); else /* MACsec offloading is off by default */ macsec->offload = MACSEC_OFFLOAD_OFF; /* Check if the offloading mode is supported by the underlying layers */ if (macsec->offload != MACSEC_OFFLOAD_OFF && !macsec_check_offload(macsec->offload, macsec)) return -EOPNOTSUPP; /* send_sci must be set to true when transmit sci explicitly is set */ if ((data && data[IFLA_MACSEC_SCI]) && (data && data[IFLA_MACSEC_INC_SCI])) { u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); if (!send_sci) return -EINVAL; } if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); mtu = real_dev->mtu - icv_len - macsec_extra_len(true); if (mtu < 0) dev->mtu = 0; else dev->mtu = mtu; rx_handler = rtnl_dereference(real_dev->rx_handler); if (rx_handler && rx_handler != macsec_handle_frame) return -EBUSY; err = register_netdevice(dev); if (err < 0) return err; netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macsec_netdev_addr_lock_key); err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; /* need to be already registered so that ->init has run and * the MAC addr is set */ if (data && data[IFLA_MACSEC_SCI]) sci = nla_get_sci(data[IFLA_MACSEC_SCI]); else if (data && data[IFLA_MACSEC_PORT]) sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); else sci = dev_to_sci(dev, MACSEC_PORT_ES); if (rx_handler && sci_exists(real_dev, sci)) { err = -EBUSY; goto unlink; } err = macsec_add_dev(dev, sci, icv_len); if (err) goto unlink; if (data) { err = macsec_changelink_common(dev, data); if (err) goto del_dev; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_add_secy, &ctx); if (err) goto del_dev; macsec->insert_tx_tag = macsec_needs_tx_tag(macsec, ops); } } err = register_macsec_dev(real_dev, 
dev); if (err < 0) goto del_dev; netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); macsec_generation++; return 0; del_dev: macsec_del_dev(macsec); unlink: netdev_upper_dev_unlink(real_dev, dev); unregister: unregister_netdevice(dev); return err; } static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u64 csid = MACSEC_DEFAULT_CIPHER_ID; u8 icv_len = MACSEC_DEFAULT_ICV_LEN; int flag; bool es, scb, sci; if (!data) return 0; if (data[IFLA_MACSEC_CIPHER_SUITE]) csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); if (data[IFLA_MACSEC_ICV_LEN]) { icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); if (icv_len != MACSEC_DEFAULT_ICV_LEN) { char dummy_key[DEFAULT_SAK_LEN] = { 0 }; struct crypto_aead *dummy_tfm; dummy_tfm = macsec_alloc_tfm(dummy_key, DEFAULT_SAK_LEN, icv_len); if (IS_ERR(dummy_tfm)) return PTR_ERR(dummy_tfm); crypto_free_aead(dummy_tfm); } } switch (csid) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_CIPHER_ID_GCM_AES_256: case MACSEC_CIPHER_ID_GCM_AES_XPN_128: case MACSEC_CIPHER_ID_GCM_AES_XPN_256: case MACSEC_DEFAULT_CIPHER_ID: if (icv_len < MACSEC_MIN_ICV_LEN || icv_len > MACSEC_STD_ICV_LEN) return -EINVAL; break; default: return -EINVAL; } if (data[IFLA_MACSEC_ENCODING_SA]) { if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) return -EINVAL; } for (flag = IFLA_MACSEC_ENCODING_SA + 1; flag < IFLA_MACSEC_VALIDATION; flag++) { if (data[flag]) { if (nla_get_u8(data[flag]) > 1) return -EINVAL; } } es = nla_get_u8_default(data[IFLA_MACSEC_ES], false); sci = nla_get_u8_default(data[IFLA_MACSEC_INC_SCI], false); scb = nla_get_u8_default(data[IFLA_MACSEC_SCB], false); if ((sci && (scb || es)) || (scb && es)) return -EINVAL; if (data[IFLA_MACSEC_VALIDATION] && nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) return -EINVAL; if ((data[IFLA_MACSEC_REPLAY_PROTECT] && nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && !data[IFLA_MACSEC_WINDOW]) return -EINVAL; return 0; } static struct net *macsec_get_link_net(const struct net_device *dev) { return dev_net(macsec_priv(dev)->real_dev); } struct net_device *macsec_get_real_dev(const struct net_device *dev) { return macsec_priv(dev)->real_dev; } EXPORT_SYMBOL_GPL(macsec_get_real_dev); bool macsec_netdev_is_offloaded(struct net_device *dev) { return macsec_is_offloaded(macsec_priv(dev)); } EXPORT_SYMBOL_GPL(macsec_netdev_is_offloaded); static size_t macsec_get_size(const struct net_device *dev) { return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ nla_total_size(1) + /* IFLA_MACSEC_ES */ nla_total_size(1) + /* IFLA_MACSEC_SCB */ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ nla_total_size(1) + /* IFLA_MACSEC_OFFLOAD */ 0; } static int macsec_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macsec_tx_sc *tx_sc; struct macsec_dev *macsec; struct macsec_secy *secy; u64 csid; macsec = macsec_priv(dev); secy = &macsec->secy; tx_sc = &secy->tx_sc; switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; break; default: goto nla_put_failure; } if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, csid, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || nla_put_u8(skb, IFLA_MACSEC_OFFLOAD, macsec->offload) || 0) goto nla_put_failure; if (secy->replay_protect) { if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops macsec_link_ops __read_mostly = { .kind = "macsec", .priv_size = sizeof(struct macsec_dev), .maxtype = IFLA_MACSEC_MAX, .policy = macsec_rtnl_policy, .setup = macsec_setup, .validate = macsec_validate_attr, .newlink = macsec_newlink, .changelink = macsec_changelink, .dellink = macsec_dellink, .get_size = macsec_get_size, .fill_info = macsec_fill_info, .get_link_net = macsec_get_link_net, }; static bool is_macsec_master(struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; } static int macsec_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); struct macsec_rxh_data *rxd; struct macsec_dev *m, *n; LIST_HEAD(head); if (!is_macsec_master(real_dev)) return NOTIFY_DONE; rxd = macsec_data_rtnl(real_dev); switch (event) { case NETDEV_DOWN: case NETDEV_UP: case NETDEV_CHANGE: list_for_each_entry_safe(m, n, &rxd->secys, secys) { struct net_device *dev = m->secy.netdev; netif_stacked_transfer_operstate(real_dev, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry_safe(m, n, &rxd->secys, secys) { macsec_common_dellink(m->secy.netdev, &head); } netdev_rx_handler_unregister(real_dev); kfree(rxd); unregister_netdevice_many(&head); break; case NETDEV_CHANGEMTU: list_for_each_entry(m, &rxd->secys, secys) { struct net_device *dev = m->secy.netdev; unsigned int mtu = real_dev->mtu - (m->secy.icv_len + macsec_extra_len(true)); if (dev->mtu > mtu) dev_set_mtu(dev, mtu); } break; case NETDEV_FEAT_CHANGE: list_for_each_entry(m, &rxd->secys, secys) { macsec_inherit_tso_max(m->secy.netdev); netdev_update_features(m->secy.netdev); } break; } return NOTIFY_OK; } static struct notifier_block macsec_notifier = { .notifier_call = macsec_notify, }; static int __init macsec_init(void) { int err; pr_info("MACsec IEEE 802.1AE\n"); err = register_netdevice_notifier(&macsec_notifier); if (err) return err; err = rtnl_link_register(&macsec_link_ops); if (err) goto notifier; err = genl_register_family(&macsec_fam); if (err) goto rtnl; return 0; rtnl: rtnl_link_unregister(&macsec_link_ops); notifier: unregister_netdevice_notifier(&macsec_notifier); return err; } static void __exit macsec_exit(void) { genl_unregister_family(&macsec_fam); rtnl_link_unregister(&macsec_link_ops); unregister_netdevice_notifier(&macsec_notifier); rcu_barrier(); } 
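/*
 * Illustrative sketch, not part of the original file: the rough shape of
 * the macsec_ops an offloading driver supplies, matching the ops->mdo_*
 * hooks invoked through macsec_offload() above. The foo_* names are
 * hypothetical; a real driver fills in many more hooks (mdo_add_rxsc,
 * mdo_upd_txsa, the stats getters, ...).
 */
static int foo_mdo_dev_open(struct macsec_context *ctx)
{
	/* program the hardware for ctx->secy before traffic starts */
	return 0;
}

static int foo_mdo_add_secy(struct macsec_context *ctx)
{
	/* allocate a hardware SecY matching ctx->secy->sci */
	return 0;
}

static const struct macsec_ops foo_macsec_ops = {
	.mdo_dev_open = foo_mdo_dev_open,
	.mdo_add_secy = foo_mdo_add_secy,
};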
module_init(macsec_init);
module_exit(macsec_exit);

MODULE_ALIAS_RTNL_LINK("macsec");
MODULE_ALIAS_GENL_FAMILY("macsec");

MODULE_DESCRIPTION("MACsec IEEE 802.1AE");
MODULE_LICENSE("GPL v2");
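/*
 * Illustrative sketch, not from the original sources: how a Secure Channel
 * Identifier is laid out, mirroring dev_to_sci()/make_sci() used above.
 * Per IEEE 802.1AE, the 64-bit SCI is the 6-byte MAC address followed by a
 * 16-bit port number in network byte order. Standalone userspace mock.
 */
#include <stdint.h>
#include <string.h>

static uint64_t example_make_sci(const uint8_t addr[6], uint16_t port_be)
{
	uint64_t sci;

	memcpy(&sci, addr, 6);                    /* bytes 0-5: MAC address */
	memcpy((uint8_t *)&sci + 6, &port_be, 2); /* bytes 6-7: port number */
	return sci;
}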
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 */
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H

#include <linux/bitops.h>
#include <linux/gfp_types.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/math.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>

/* Keep unconverted code working */
#define radix_tree_root		xarray
#define radix_tree_node		xa_node

struct radix_tree_preload {
	local_lock_t lock;
	unsigned nr;
	/* nodes->parent points to next preallocated node */
	struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);

/*
 * The bottom two bits of the slot determine how the remaining bits in the
 * slot are interpreted:
 *
 * 00 - data pointer
 * 10 - internal entry
 * x1 - value entry
 *
 * The internal entry may be a pointer to the next level in the tree, a
 * sibling entry, or an indicator that the entry in this slot has been moved
 * to another location in the tree and the lookup should be restarted. While
 * NULL fits the 'data pointer' pattern, it means that there is no entry in
 * the tree for this index (no matter what level of the tree it is found at).
 * This means that storing a NULL entry in the tree is the same as deleting
 * the entry from the tree.
 */
#define RADIX_TREE_ENTRY_MASK		3UL
#define RADIX_TREE_INTERNAL_NODE	2UL

static inline bool radix_tree_is_internal_node(void *ptr)
{
	return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) ==
				RADIX_TREE_INTERNAL_NODE;
}
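/*
 * Illustrative sketch, not part of the original header: a standalone
 * userspace mock showing how the low two bits classify an entry. The
 * EXAMPLE_* constants repeat the definitions above; the pointer values
 * are fabricated for the demonstration.
 */
#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_ENTRY_MASK	3UL
#define EXAMPLE_INTERNAL_NODE	2UL

static bool example_is_internal(unsigned long entry)
{
	return (entry & EXAMPLE_ENTRY_MASK) == EXAMPLE_INTERNAL_NODE;
}

int main(void)
{
	unsigned long data     = 0x1000;		/* 00: data pointer   */
	unsigned long internal = 0x1000 | 2;		/* 10: internal entry */
	unsigned long value    = (42UL << 1) | 1;	/* x1: value entry    */

	printf("%d %d %d\n", example_is_internal(data),
	       example_is_internal(internal),
	       example_is_internal(value));		/* prints: 0 1 0 */
	return 0;
}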
/*** radix-tree API starts here ***/

#define RADIX_TREE_MAP_SHIFT	XA_CHUNK_SHIFT
#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)

#define RADIX_TREE_MAX_TAGS	XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS	XA_MARK_LONGS

#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
					  RADIX_TREE_MAP_SHIFT))

/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR	((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT	(__GFP_BITS_SHIFT)

#define RADIX_TREE_INIT(name, mask)	XARRAY_INIT(name, mask)

#define RADIX_TREE(name, mask) \
	struct radix_tree_root name = RADIX_TREE_INIT(name, mask)

#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)

static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
	return root->xa_head == NULL;
}

/**
 * struct radix_tree_iter - radix tree iterator state
 *
 * @index:	index of current slot
 * @next_index:	one beyond the last index for this chunk
 * @tags:	bit-mask for tag-iterating
 * @node:	node that contains current slot
 *
 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
 * subinterval of slots contained within one radix tree leaf node. It is
 * described by a pointer to its first slot and a struct radix_tree_iter
 * which holds the chunk's position in the tree and its size. For tagged
 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
 * radix tree tag.
 */
struct radix_tree_iter {
	unsigned long	index;
	unsigned long	next_index;
	unsigned long	tags;
	struct radix_tree_node *node;
};

/**
 * Radix-tree synchronization
 *
 * The radix-tree API requires that users provide all synchronisation (with
 * specific exceptions, noted below).
 *
 * Synchronization of access to the data items being stored in the tree, and
 * management of their lifetimes must be completely managed by API users.
 *
 * For API usage, in general,
 * - any function _modifying_ the tree or tags (inserting or deleting
 *   items, setting or clearing tags) must exclude other modifications, and
 *   exclude any functions reading the tree.
 * - any function _reading_ the tree or tags (looking up items or tags,
 *   gang lookups) must exclude modifications to the tree, but may occur
 *   concurrently with other readers.
 *
 * The notable exceptions to this rule are the following functions:
 * __radix_tree_lookup
 * radix_tree_lookup
 * radix_tree_lookup_slot
 * radix_tree_tag_get
 * radix_tree_gang_lookup
 * radix_tree_gang_lookup_tag
 * radix_tree_gang_lookup_tag_slot
 * radix_tree_tagged
 *
 * The first 7 functions are able to be called locklessly, using RCU. The
 * caller must ensure calls to these functions are made within rcu_read_lock()
 * regions. Other readers (lock-free or otherwise) and modifications may be
 * running concurrently.
 *
 * It is still required that the caller manage the synchronization and lifetimes
 * of the items. So if RCU lock-free lookups are used, typically this would mean
 * that the items have their own locks, or are amenable to lock-free access; and
 * that the items are freed by RCU (or only freed after having been deleted from
 * the radix tree *and* a synchronize_rcu() grace period).
 *
 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
 * access to data items when inserting into or looking up from the radix tree)
 *
 * Note that the value returned by radix_tree_tag_get() may not be relied upon
 * if only the RCU read lock is held. Functions to set/clear tags and to
 * delete nodes running concurrently with it may affect its result such that
 * two consecutive reads in the same locked section may return different
 * values. If reliability is required, modification functions must also be
 * excluded from concurrency.
 *
 * radix_tree_tagged is able to be called without locking or RCU.
 */
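/*
 * Illustrative sketch, not part of the original header: the lockless
 * lookup pattern the comment above describes. "struct foo" and
 * foo_lookup() are hypothetical; the items are assumed to be refcounted
 * and freed only after an RCU grace period, as required above.
 */
struct foo {
	refcount_t refcnt;
	struct rcu_head rcu;
};

static struct foo *foo_lookup(struct radix_tree_root *root, unsigned long index)
{
	struct foo *item;

	rcu_read_lock();
	item = radix_tree_lookup(root, index);
	/* take a reference before leaving the RCU read-side section */
	if (item && !refcount_inc_not_zero(&item->refcnt))
		item = NULL;
	rcu_read_unlock();

	return item;
}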
/**
 * radix_tree_deref_slot - dereference a slot
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * For use with radix_tree_lookup_slot().  Caller must hold tree at least read
 * locked across slot lookup and dereference. Not required if write lock is
 * held (ie. items cannot be concurrently inserted).
 *
 * radix_tree_deref_retry must be used to confirm validity of the pointer if
 * only the read lock is held.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
	return rcu_dereference(*slot);
}

/**
 * radix_tree_deref_slot_protected - dereference a slot with tree lock held
 * @slot: slot pointer, returned by radix_tree_lookup_slot
 *
 * Similar to radix_tree_deref_slot.  The caller does not hold the RCU read
 * lock but it must hold the tree lock to prevent parallel updates.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
						    spinlock_t *treelock)
{
	return rcu_dereference_protected(*slot, lockdep_is_held(treelock));
}

/**
 * radix_tree_deref_retry - check radix_tree_deref_slot
 * @arg: pointer returned by radix_tree_deref_slot
 * Returns: 0 if retry is not required, otherwise retry is required
 *
 * radix_tree_deref_retry must be used with radix_tree_deref_slot.
 */
static inline int radix_tree_deref_retry(void *arg)
{
	return unlikely(radix_tree_is_internal_node(arg));
}
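/*
 * Illustrative sketch (not part of the original header): dereferencing
 * a slot under RCU and retrying when radix_tree_deref_retry() reports
 * that the tree moved underneath us.  Reuses the hypothetical
 * "struct foo" from the previous sketch; radix_tree_lookup_slot() is
 * declared further down this header.
 */
static struct foo *foo_find_slot(struct radix_tree_root *my_tree,
				 unsigned long index)
{
	struct foo *item = NULL;
	void __rcu **slot;

	rcu_read_lock();
repeat:
	slot = radix_tree_lookup_slot(my_tree, index);
	if (slot) {
		item = radix_tree_deref_slot(slot);
		if (radix_tree_deref_retry(item))
			goto repeat;	/* concurrent modification, retry */
	}
	rcu_read_unlock();
	return item;
}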
/**
 * radix_tree_exception - radix_tree_deref_slot returned either exception?
 * @arg: value returned by radix_tree_deref_slot
 * Returns: 0 if well-aligned pointer, non-0 if either kind of exception.
 */
static inline int radix_tree_exception(void *arg)
{
	return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
}

int radix_tree_insert(struct radix_tree_root *, unsigned long index,
			void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
			  struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
					unsigned long index);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
			  void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
		const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
			     void __rcu **slot, void *entry);
void radix_tree_iter_delete(struct radix_tree_root *,
			struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
			void **results, unsigned long first_index,
			unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
			unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
			unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
			unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
		const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
		void **results, unsigned long first_index,
		unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
		void __rcu ***results, unsigned long first_index,
		unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);

static inline void radix_tree_preload_end(void)
{
	local_unlock(&radix_tree_preloads.lock);
}
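/*
 * Illustrative sketch (not part of the original header): the usual
 * preload pattern, where the node allocation happens in a sleepable
 * context while the insertion itself runs under a spinlock.  "my_tree",
 * "my_lock" and "item" are hypothetical.
 */
static int foo_insert(struct radix_tree_root *my_tree, spinlock_t *my_lock,
		      unsigned long index, struct foo *item)
{
	int err = radix_tree_preload(GFP_KERNEL);	/* may sleep */

	if (err)
		return err;
	spin_lock(my_lock);
	err = radix_tree_insert(my_tree, index, item);
	spin_unlock(my_lock);
	radix_tree_preload_end();	/* drops the per-CPU preload lock */
	return err;
}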
void __rcu **idr_get_free(struct radix_tree_root *root,
			      struct radix_tree_iter *iter, gfp_t gfp,
			      unsigned long max);

enum {
	RADIX_TREE_ITER_TAG_MASK = 0x0f,	/* tag index in lower nybble */
	RADIX_TREE_ITER_TAGGED   = 0x10,	/* lookup tagged slots */
	RADIX_TREE_ITER_CONTIG   = 0x20,	/* stop at first hole */
};

/**
 * radix_tree_iter_init - initialize radix tree iterator
 *
 * @iter:	pointer to iterator state
 * @start:	iteration starting index
 * Returns:	NULL
 */
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
	/*
	 * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it
	 * in the case of a successful tagged chunk lookup.  If the lookup was
	 * unsuccessful or non-tagged then nobody cares about ->tags.
	 *
	 * Set index to zero to bypass next_index overflow protection.
	 * See the comment in radix_tree_next_chunk() for details.
	 */
	iter->index = 0;
	iter->next_index = start;
	return NULL;
}

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:	radix tree root
 * @iter:	iterator state
 * @flags:	RADIX_TREE_ITER_* flags and tag index
 * Returns:	pointer to chunk first slot, or NULL if there are no more left
 *
 * This function looks up the next chunk in the radix tree starting from
 * @iter->next_index.  It returns a pointer to the chunk's first slot.
 * It also fills @iter with data about the chunk: its position in the tree
 * (index), its end (next_index), and constructs a bit mask for tagged
 * iterating (tags).
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
			     struct radix_tree_iter *iter, unsigned flags);

/**
 * radix_tree_iter_lookup - look up an index in the radix tree
 * @root: radix tree root
 * @iter: iterator state
 * @index: key to look up
 *
 * If @index is present in the radix tree, this function returns the slot
 * containing it and updates @iter to describe the entry.  If @index is not
 * present, it returns NULL.
 */
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
			struct radix_tree_iter *iter, unsigned long index)
{
	radix_tree_iter_init(iter, index);
	return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);
}

/**
 * radix_tree_iter_retry - retry this chunk of the iteration
 * @iter:	iterator state
 *
 * If we iterate over a tree protected only by the RCU lock, a race
 * against deletion or creation may result in seeing a slot for which
 * radix_tree_deref_retry() returns true.  If so, call this function
 * and continue the iteration.
 */
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
	iter->next_index = iter->index;
	iter->tags = 0;
	return NULL;
}

static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
	return iter->index + slots;
}

/**
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot:	pointer to current slot
 * @iter:	iterator state
 * Returns:	New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * before releasing the lock to continue the iteration from the next index.
 */
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
					struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - get current chunk size
 *
 * @iter:	pointer to radix tree iterator
 * Returns:	current chunk size
 */
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
	return iter->next_index - iter->index;
}

/**
 * radix_tree_next_slot - find next slot in chunk
 *
 * @slot:	pointer to current slot
 * @iter:	pointer to iterator state
 * @flags:	RADIX_TREE_ITER_*, should be constant
 * Returns:	pointer to next slot, or NULL if there are no more left
 *
 * This function updates @iter->index in the case of a successful lookup.
 * For tagged lookup it also eats @iter->tags.
 *
 * There are several cases where 'slot' can be passed in as NULL to this
 * function.  These cases result from the use of radix_tree_iter_resume() or
 * radix_tree_iter_retry().  In these cases we don't end up dereferencing
 * 'slot' because either:
 * a) we are doing tagged iteration and iter->tags has been set to 0, or
 * b) we are doing non-tagged iteration, and iter->index and iter->next_index
 *    have been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
				struct radix_tree_iter *iter, unsigned flags)
{
	if (flags & RADIX_TREE_ITER_TAGGED) {
		iter->tags >>= 1;
		if (unlikely(!iter->tags))
			return NULL;
		if (likely(iter->tags & 1ul)) {
			iter->index = __radix_tree_iter_add(iter, 1);
			slot++;
			goto found;
		}
		if (!(flags & RADIX_TREE_ITER_CONTIG)) {
			unsigned offset = __ffs(iter->tags);

			iter->tags >>= offset++;
			iter->index = __radix_tree_iter_add(iter, offset);
			slot += offset;
			goto found;
		}
	} else {
		long count = radix_tree_chunk_size(iter);

		while (--count > 0) {
			slot++;
			iter->index = __radix_tree_iter_add(iter, 1);

			if (likely(*slot))
				goto found;
			if (flags & RADIX_TREE_ITER_CONTIG) {
				/* forbid switching to the next chunk */
				iter->next_index = 0;
				break;
			}
		}
	}
	return NULL;

 found:
	return slot;
}

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:	the void** variable for pointer to slot
 * @root:	the struct radix_tree_root pointer
 * @iter:	the struct radix_tree_iter pointer
 * @start:	iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)		\
	for (slot = radix_tree_iter_init(iter, start) ;			\
	     slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;	\
	     slot = radix_tree_next_slot(slot, iter, 0))
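/*
 * Illustrative sketch (not part of the original header): iterating over
 * every present entry under RCU with radix_tree_for_each_slot(),
 * retrying a slot when a concurrent update is observed.  Reuses the
 * hypothetical "struct foo" from the earlier sketches.
 */
static void foo_dump(struct radix_tree_root *my_tree)
{
	struct radix_tree_iter iter;
	void __rcu **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, my_tree, &iter, 0) {
		struct foo *item = radix_tree_deref_slot(slot);

		if (unlikely(radix_tree_deref_retry(item))) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		pr_info("index %lu -> %p\n", iter.index, item);
	}
	rcu_read_unlock();
}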
/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:	the void** variable for pointer to slot
 * @root:	the struct radix_tree_root pointer
 * @iter:	the struct radix_tree_iter pointer
 * @start:	iteration starting index
 * @tag:	tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)	\
	for (slot = radix_tree_iter_init(iter, start) ;			\
	     slot || (slot = radix_tree_next_chunk(root, iter,		\
			      RADIX_TREE_ITER_TAGGED | tag)) ;		\
	     slot = radix_tree_next_slot(slot, iter,			\
				RADIX_TREE_ITER_TAGGED | tag))

#endif /* _LINUX_RADIX_TREE_H */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * security/tomoyo/common.h
 *
 * Header file for TOMOYO.
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#ifndef _SECURITY_TOMOYO_COMMON_H
#define _SECURITY_TOMOYO_COMMON_H

#define pr_fmt(fmt) fmt

#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/kmod.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/list.h>
#include <linux/cred.h>
#include <linux/poll.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/un.h>
#include <linux/lsm_hooks.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/udp.h>

/********** Constants definitions. **********/

/*
 * TOMOYO uses this hash only when appending a string into the string
 * table. Frequency of appending strings is very low, so we don't need a
 * large (e.g. 64k) hash size. 256 will be sufficient.
 */
#define TOMOYO_HASH_BITS 8
#define TOMOYO_MAX_HASH (1u << TOMOYO_HASH_BITS)
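/*
 * Illustrative sketch (not part of the original header): how a string
 * is mapped to its bucket in tomoyo_name_list[] (declared later in this
 * header) when appending to the string table.  The helper name is
 * hypothetical.
 */
static inline struct list_head *tomoyo_name_bucket(const char *name)
{
	return &tomoyo_name_list[full_name_hash(NULL, name, strlen(name))
				 % TOMOYO_MAX_HASH];
}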
/*
 * TOMOYO checks only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, SOCK_SEQPACKET.
 * Therefore, we don't need SOCK_MAX.
 */
#define TOMOYO_SOCK_MAX 6

#define TOMOYO_EXEC_TMPSIZE     4096

/* Garbage collector is trying to kfree() this element. */
#define TOMOYO_GC_IN_PROGRESS -1

/* Profile number is an integer between 0 and 255. */
#define TOMOYO_MAX_PROFILES 256

/* Group number is an integer between 0 and 255. */
#define TOMOYO_MAX_ACL_GROUPS 256

/* Index numbers for "struct tomoyo_condition". */
enum tomoyo_conditions_index {
	TOMOYO_TASK_UID,             /* current_uid()   */
	TOMOYO_TASK_EUID,            /* current_euid()  */
	TOMOYO_TASK_SUID,            /* current_suid()  */
	TOMOYO_TASK_FSUID,           /* current_fsuid() */
	TOMOYO_TASK_GID,             /* current_gid()   */
	TOMOYO_TASK_EGID,            /* current_egid()  */
	TOMOYO_TASK_SGID,            /* current_sgid()  */
	TOMOYO_TASK_FSGID,           /* current_fsgid() */
	TOMOYO_TASK_PID,             /* sys_getpid()    */
	TOMOYO_TASK_PPID,            /* sys_getppid()   */
	TOMOYO_EXEC_ARGC,            /* "struct linux_binprm *"->argc */
	TOMOYO_EXEC_ENVC,            /* "struct linux_binprm *"->envc */
	TOMOYO_TYPE_IS_SOCKET,       /* S_IFSOCK */
	TOMOYO_TYPE_IS_SYMLINK,      /* S_IFLNK  */
	TOMOYO_TYPE_IS_FILE,         /* S_IFREG  */
	TOMOYO_TYPE_IS_BLOCK_DEV,    /* S_IFBLK  */
	TOMOYO_TYPE_IS_DIRECTORY,    /* S_IFDIR  */
	TOMOYO_TYPE_IS_CHAR_DEV,     /* S_IFCHR  */
	TOMOYO_TYPE_IS_FIFO,         /* S_IFIFO  */
	TOMOYO_MODE_SETUID,          /* S_ISUID  */
	TOMOYO_MODE_SETGID,          /* S_ISGID  */
	TOMOYO_MODE_STICKY,          /* S_ISVTX  */
	TOMOYO_MODE_OWNER_READ,      /* S_IRUSR  */
	TOMOYO_MODE_OWNER_WRITE,     /* S_IWUSR  */
	TOMOYO_MODE_OWNER_EXECUTE,   /* S_IXUSR  */
	TOMOYO_MODE_GROUP_READ,      /* S_IRGRP  */
	TOMOYO_MODE_GROUP_WRITE,     /* S_IWGRP  */
	TOMOYO_MODE_GROUP_EXECUTE,   /* S_IXGRP  */
	TOMOYO_MODE_OTHERS_READ,     /* S_IROTH  */
	TOMOYO_MODE_OTHERS_WRITE,    /* S_IWOTH  */
	TOMOYO_MODE_OTHERS_EXECUTE,  /* S_IXOTH  */
	TOMOYO_EXEC_REALPATH,
	TOMOYO_SYMLINK_TARGET,
	TOMOYO_PATH1_UID,
	TOMOYO_PATH1_GID,
	TOMOYO_PATH1_INO,
	TOMOYO_PATH1_MAJOR,
	TOMOYO_PATH1_MINOR,
	TOMOYO_PATH1_PERM,
	TOMOYO_PATH1_TYPE,
	TOMOYO_PATH1_DEV_MAJOR,
	TOMOYO_PATH1_DEV_MINOR,
	TOMOYO_PATH2_UID,
	TOMOYO_PATH2_GID,
	TOMOYO_PATH2_INO,
	TOMOYO_PATH2_MAJOR,
	TOMOYO_PATH2_MINOR,
	TOMOYO_PATH2_PERM,
	TOMOYO_PATH2_TYPE,
	TOMOYO_PATH2_DEV_MAJOR,
	TOMOYO_PATH2_DEV_MINOR,
	TOMOYO_PATH1_PARENT_UID,
	TOMOYO_PATH1_PARENT_GID,
	TOMOYO_PATH1_PARENT_INO,
	TOMOYO_PATH1_PARENT_PERM,
	TOMOYO_PATH2_PARENT_UID,
	TOMOYO_PATH2_PARENT_GID,
	TOMOYO_PATH2_PARENT_INO,
	TOMOYO_PATH2_PARENT_PERM,
	TOMOYO_MAX_CONDITION_KEYWORD,
	TOMOYO_NUMBER_UNION,
	TOMOYO_NAME_UNION,
	TOMOYO_ARGV_ENTRY,
	TOMOYO_ENVP_ENTRY,
};

/* Index numbers for stat(). */
enum tomoyo_path_stat_index {
	/* Do not change this order. */
	TOMOYO_PATH1,
	TOMOYO_PATH1_PARENT,
	TOMOYO_PATH2,
	TOMOYO_PATH2_PARENT,
	TOMOYO_MAX_PATH_STAT
};

/* Index numbers for operation mode. */
enum tomoyo_mode_index {
	TOMOYO_CONFIG_DISABLED,
	TOMOYO_CONFIG_LEARNING,
	TOMOYO_CONFIG_PERMISSIVE,
	TOMOYO_CONFIG_ENFORCING,
	TOMOYO_CONFIG_MAX_MODE,
	TOMOYO_CONFIG_WANT_REJECT_LOG =  64,
	TOMOYO_CONFIG_WANT_GRANT_LOG  = 128,
	TOMOYO_CONFIG_USE_DEFAULT     = 255,
};

/* Index numbers for entry type. */
enum tomoyo_policy_id {
	TOMOYO_ID_GROUP,
	TOMOYO_ID_ADDRESS_GROUP,
	TOMOYO_ID_PATH_GROUP,
	TOMOYO_ID_NUMBER_GROUP,
	TOMOYO_ID_TRANSITION_CONTROL,
	TOMOYO_ID_AGGREGATOR,
	TOMOYO_ID_MANAGER,
	TOMOYO_ID_CONDITION,
	TOMOYO_ID_NAME,
	TOMOYO_ID_ACL,
	TOMOYO_ID_DOMAIN,
	TOMOYO_MAX_POLICY
};

/* Index numbers for domain's attributes. */
enum tomoyo_domain_info_flags_index {
	/* Quota warning flag. */
	TOMOYO_DIF_QUOTA_WARNED,
	/*
	 * This domain was unable to create a new domain at
	 * tomoyo_find_next_domain() because the name of the domain to be
	 * created was too long or it could not allocate memory.
	 * More than one process continued execve() without domain transition.
	 */
	TOMOYO_DIF_TRANSITION_FAILED,
	TOMOYO_MAX_DOMAIN_INFO_FLAGS
};

/* Index numbers for audit type. */
enum tomoyo_grant_log {
	/* Follow profile's configuration. */
	TOMOYO_GRANTLOG_AUTO,
	/* Do not generate grant log. */
	TOMOYO_GRANTLOG_NO,
	/* Generate grant_log. */
	TOMOYO_GRANTLOG_YES,
};
/* Index numbers for group entries. */
enum tomoyo_group_id {
	TOMOYO_PATH_GROUP,
	TOMOYO_NUMBER_GROUP,
	TOMOYO_ADDRESS_GROUP,
	TOMOYO_MAX_GROUP
};

/* Index numbers for type of numeric values. */
enum tomoyo_value_type {
	TOMOYO_VALUE_TYPE_INVALID,
	TOMOYO_VALUE_TYPE_DECIMAL,
	TOMOYO_VALUE_TYPE_OCTAL,
	TOMOYO_VALUE_TYPE_HEXADECIMAL,
};

/* Index numbers for domain transition control keywords. */
enum tomoyo_transition_type {
	/* Do not change this order. */
	TOMOYO_TRANSITION_CONTROL_NO_RESET,
	TOMOYO_TRANSITION_CONTROL_RESET,
	TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE,
	TOMOYO_TRANSITION_CONTROL_INITIALIZE,
	TOMOYO_TRANSITION_CONTROL_NO_KEEP,
	TOMOYO_TRANSITION_CONTROL_KEEP,
	TOMOYO_MAX_TRANSITION_TYPE
};

/* Index numbers for Access Controls. */
enum tomoyo_acl_entry_type_index {
	TOMOYO_TYPE_PATH_ACL,
	TOMOYO_TYPE_PATH2_ACL,
	TOMOYO_TYPE_PATH_NUMBER_ACL,
	TOMOYO_TYPE_MKDEV_ACL,
	TOMOYO_TYPE_MOUNT_ACL,
	TOMOYO_TYPE_INET_ACL,
	TOMOYO_TYPE_UNIX_ACL,
	TOMOYO_TYPE_ENV_ACL,
	TOMOYO_TYPE_MANUAL_TASK_ACL,
};

/* Index numbers for access controls with one pathname. */
enum tomoyo_path_acl_index {
	TOMOYO_TYPE_EXECUTE,
	TOMOYO_TYPE_READ,
	TOMOYO_TYPE_WRITE,
	TOMOYO_TYPE_APPEND,
	TOMOYO_TYPE_UNLINK,
	TOMOYO_TYPE_GETATTR,
	TOMOYO_TYPE_RMDIR,
	TOMOYO_TYPE_TRUNCATE,
	TOMOYO_TYPE_SYMLINK,
	TOMOYO_TYPE_CHROOT,
	TOMOYO_TYPE_UMOUNT,
	TOMOYO_MAX_PATH_OPERATION
};

/* Index numbers for /sys/kernel/security/tomoyo/stat interface. */
enum tomoyo_memory_stat_type {
	TOMOYO_MEMORY_POLICY,
	TOMOYO_MEMORY_AUDIT,
	TOMOYO_MEMORY_QUERY,
	TOMOYO_MAX_MEMORY_STAT
};

enum tomoyo_mkdev_acl_index {
	TOMOYO_TYPE_MKBLOCK,
	TOMOYO_TYPE_MKCHAR,
	TOMOYO_MAX_MKDEV_OPERATION
};

/* Index numbers for socket operations. */
enum tomoyo_network_acl_index {
	TOMOYO_NETWORK_BIND,    /* bind() operation.    */
	TOMOYO_NETWORK_LISTEN,  /* listen() operation.  */
	TOMOYO_NETWORK_CONNECT, /* connect() operation. */
	TOMOYO_NETWORK_SEND,    /* send() operation.    */
	TOMOYO_MAX_NETWORK_OPERATION
};

/* Index numbers for access controls with two pathnames. */
enum tomoyo_path2_acl_index {
	TOMOYO_TYPE_LINK,
	TOMOYO_TYPE_RENAME,
	TOMOYO_TYPE_PIVOT_ROOT,
	TOMOYO_MAX_PATH2_OPERATION
};

/* Index numbers for access controls with one pathname and one number. */
enum tomoyo_path_number_acl_index {
	TOMOYO_TYPE_CREATE,
	TOMOYO_TYPE_MKDIR,
	TOMOYO_TYPE_MKFIFO,
	TOMOYO_TYPE_MKSOCK,
	TOMOYO_TYPE_IOCTL,
	TOMOYO_TYPE_CHMOD,
	TOMOYO_TYPE_CHOWN,
	TOMOYO_TYPE_CHGRP,
	TOMOYO_MAX_PATH_NUMBER_OPERATION
};

/* Index numbers for /sys/kernel/security/tomoyo/ interfaces. */
enum tomoyo_securityfs_interface_index {
	TOMOYO_DOMAINPOLICY,
	TOMOYO_EXCEPTIONPOLICY,
	TOMOYO_PROCESS_STATUS,
	TOMOYO_STAT,
	TOMOYO_AUDIT,
	TOMOYO_VERSION,
	TOMOYO_PROFILE,
	TOMOYO_QUERY,
	TOMOYO_MANAGER
};

/* Index numbers for special mount operations. */
enum tomoyo_special_mount {
	TOMOYO_MOUNT_BIND,            /* mount --bind /source /dest   */
	TOMOYO_MOUNT_MOVE,            /* mount --move /old /new       */
	TOMOYO_MOUNT_REMOUNT,         /* mount -o remount /dir        */
	TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */
	TOMOYO_MOUNT_MAKE_PRIVATE,    /* mount --make-private /dir    */
	TOMOYO_MOUNT_MAKE_SLAVE,      /* mount --make-slave /dir      */
	TOMOYO_MOUNT_MAKE_SHARED,     /* mount --make-shared /dir     */
	TOMOYO_MAX_SPECIAL_MOUNT
};

/* Index numbers for functionality. */
enum tomoyo_mac_index {
	TOMOYO_MAC_FILE_EXECUTE,
	TOMOYO_MAC_FILE_OPEN,
	TOMOYO_MAC_FILE_CREATE,
	TOMOYO_MAC_FILE_UNLINK,
	TOMOYO_MAC_FILE_GETATTR,
	TOMOYO_MAC_FILE_MKDIR,
	TOMOYO_MAC_FILE_RMDIR,
	TOMOYO_MAC_FILE_MKFIFO,
	TOMOYO_MAC_FILE_MKSOCK,
	TOMOYO_MAC_FILE_TRUNCATE,
	TOMOYO_MAC_FILE_SYMLINK,
	TOMOYO_MAC_FILE_MKBLOCK,
	TOMOYO_MAC_FILE_MKCHAR,
	TOMOYO_MAC_FILE_LINK,
	TOMOYO_MAC_FILE_RENAME,
	TOMOYO_MAC_FILE_CHMOD,
	TOMOYO_MAC_FILE_CHOWN,
	TOMOYO_MAC_FILE_CHGRP,
	TOMOYO_MAC_FILE_IOCTL,
	TOMOYO_MAC_FILE_CHROOT,
	TOMOYO_MAC_FILE_MOUNT,
	TOMOYO_MAC_FILE_UMOUNT,
	TOMOYO_MAC_FILE_PIVOT_ROOT,
	TOMOYO_MAC_NETWORK_INET_STREAM_BIND,
	TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN,
	TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT,
	TOMOYO_MAC_NETWORK_INET_DGRAM_BIND,
	TOMOYO_MAC_NETWORK_INET_DGRAM_SEND,
	TOMOYO_MAC_NETWORK_INET_RAW_BIND,
	TOMOYO_MAC_NETWORK_INET_RAW_SEND,
	TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND,
	TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN,
	TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT,
	TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND,
	TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND,
	TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND,
	TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN,
	TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT,
	TOMOYO_MAC_ENVIRON,
	TOMOYO_MAX_MAC_INDEX
};

/* Index numbers for category of functionality. */
enum tomoyo_mac_category_index {
	TOMOYO_MAC_CATEGORY_FILE,
	TOMOYO_MAC_CATEGORY_NETWORK,
	TOMOYO_MAC_CATEGORY_MISC,
	TOMOYO_MAX_MAC_CATEGORY_INDEX
};

/*
 * Retry this request. Returned by tomoyo_supervisor() if policy violation has
 * occurred in enforcing mode and the userspace daemon decided to retry.
 *
 * We must choose a positive value in order to distinguish "retry" from
 * "granted" (which is 0) and "rejected" (which is a negative value).
 */
#define TOMOYO_RETRY_REQUEST 1
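/*
 * Illustrative sketch (not part of the original header): how callers
 * typically consume TOMOYO_RETRY_REQUEST, looping while the userspace
 * supervisor asks for a retry.  The function and its arguments are
 * hypothetical, and the types and prototypes used here are declared
 * later in this header.
 */
static int tomoyo_example_ask(struct tomoyo_request_info *r,
			      const char *operation, const char *pathname)
{
	int error;

	do {
		error = tomoyo_supervisor(r, "file %s %s\n",
					  operation, pathname);
	} while (error == TOMOYO_RETRY_REQUEST);
	return error;	/* 0 = granted, negative value = rejected */
}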
/* Index numbers for /sys/kernel/security/tomoyo/stat interface. */
enum tomoyo_policy_stat_type {
	/* Do not change this order. */
	TOMOYO_STAT_POLICY_UPDATES,
	TOMOYO_STAT_POLICY_LEARNING,   /* == TOMOYO_CONFIG_LEARNING   */
	TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */
	TOMOYO_STAT_POLICY_ENFORCING,  /* == TOMOYO_CONFIG_ENFORCING  */
	TOMOYO_MAX_POLICY_STAT
};

/* Index numbers for profile's PREFERENCE values. */
enum tomoyo_pref_index {
	TOMOYO_PREF_MAX_AUDIT_LOG,
	TOMOYO_PREF_MAX_LEARNING_ENTRY,
	TOMOYO_MAX_PREF
};

/********** Structure definitions. **********/

/* Common header for holding ACL entries. */
struct tomoyo_acl_head {
	struct list_head list;
	s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
} __packed;

/* Common header for shared entries. */
struct tomoyo_shared_acl_head {
	struct list_head list;
	atomic_t users;
} __packed;

struct tomoyo_policy_namespace;

/* Structure for request info. */
struct tomoyo_request_info {
	/*
	 * For holding parameters specific to operations which deal with
	 * files.  NULL if not dealing with files.
	 */
	struct tomoyo_obj_info *obj;
	/*
	 * For holding parameters specific to execve() request.
	 * NULL if not dealing with execve().
	 */
	struct tomoyo_execve *ee;
	struct tomoyo_domain_info *domain;
	/* For holding parameters. */
	union {
		struct {
			const struct tomoyo_path_info *filename;
			/* For using wildcards at tomoyo_find_next_domain(). */
			const struct tomoyo_path_info *matched_path;
			/* One of values in "enum tomoyo_path_acl_index". */
			u8 operation;
		} path;
		struct {
			const struct tomoyo_path_info *filename1;
			const struct tomoyo_path_info *filename2;
			/* One of values in "enum tomoyo_path2_acl_index". */
			u8 operation;
		} path2;
		struct {
			const struct tomoyo_path_info *filename;
			unsigned int mode;
			unsigned int major;
			unsigned int minor;
			/* One of values in "enum tomoyo_mkdev_acl_index". */
			u8 operation;
		} mkdev;
		struct {
			const struct tomoyo_path_info *filename;
			unsigned long number;
			/*
			 * One of values in
			 * "enum tomoyo_path_number_acl_index".
			 */
			u8 operation;
		} path_number;
		struct {
			const struct tomoyo_path_info *name;
		} environ;
		struct {
			const __be32 *address;
			u16 port;
			/* One of values smaller than TOMOYO_SOCK_MAX. */
			u8 protocol;
			/* One of values in "enum tomoyo_network_acl_index". */
			u8 operation;
			bool is_ipv6;
		} inet_network;
		struct {
			const struct tomoyo_path_info *address;
			/* One of values smaller than TOMOYO_SOCK_MAX. */
			u8 protocol;
			/* One of values in "enum tomoyo_network_acl_index". */
			u8 operation;
		} unix_network;
		struct {
			const struct tomoyo_path_info *type;
			const struct tomoyo_path_info *dir;
			const struct tomoyo_path_info *dev;
			unsigned long flags;
			int need_dev;
		} mount;
		struct {
			const struct tomoyo_path_info *domainname;
		} task;
	} param;
	struct tomoyo_acl_info *matched_acl;
	u8 param_type;
	bool granted;
	u8 retry;
	u8 profile;
	u8 mode; /* One of values in "enum tomoyo_mode_index". */
	u8 type;
};
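/*
 * Illustrative sketch (not part of the original header): how a checker
 * typically prepares "struct tomoyo_request_info" before matching ACL
 * entries (cf. tomoyo_init_request_info() and tomoyo_check_acl(),
 * declared later in this header).  Names and values are hypothetical.
 */
static int tomoyo_example_path_check(struct tomoyo_domain_info *domain,
				     const struct tomoyo_path_info *filename)
{
	struct tomoyo_request_info r;

	if (tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN)
	    == TOMOYO_CONFIG_DISABLED)
		return 0;	/* checks for this operation are disabled */
	r.param_type = TOMOYO_TYPE_PATH_ACL;
	r.param.path.filename = filename;
	r.param.path.operation = TOMOYO_TYPE_READ;
	/* ... tomoyo_check_acl(&r, checker) would then walk the ACL list. */
	return 0;
}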
/* Structure for holding a token. */
struct tomoyo_path_info {
	const char *name;
	u32 hash;          /* = full_name_hash(name, strlen(name)) */
	u16 const_len;     /* = tomoyo_const_part_length(name)     */
	bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
	bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
};

/* Structure for holding string data. */
struct tomoyo_name {
	struct tomoyo_shared_acl_head head;
	struct tomoyo_path_info entry;
};

/* Structure for holding a word. */
struct tomoyo_name_union {
	/* Either @filename or @group is NULL. */
	const struct tomoyo_path_info *filename;
	struct tomoyo_group *group;
};

/* Structure for holding a number. */
struct tomoyo_number_union {
	unsigned long values[2];
	struct tomoyo_group *group; /* Maybe NULL. */
	/* One of values in "enum tomoyo_value_type". */
	u8 value_type[2];
};

/* Structure for holding an IP address. */
struct tomoyo_ipaddr_union {
	struct in6_addr ip[2]; /* Big endian. */
	struct tomoyo_group *group; /* Pointer to address group. */
	bool is_ipv6; /* Valid only if @group == NULL. */
};

/* Structure for "path_group"/"number_group"/"address_group" directive. */
struct tomoyo_group {
	struct tomoyo_shared_acl_head head;
	const struct tomoyo_path_info *group_name;
	struct list_head member_list;
};

/* Structure for "path_group" directive. */
struct tomoyo_path_group {
	struct tomoyo_acl_head head;
	const struct tomoyo_path_info *member_name;
};

/* Structure for "number_group" directive. */
struct tomoyo_number_group {
	struct tomoyo_acl_head head;
	struct tomoyo_number_union number;
};

/* Structure for "address_group" directive. */
struct tomoyo_address_group {
	struct tomoyo_acl_head head;
	/* Structure for holding an IP address. */
	struct tomoyo_ipaddr_union address;
};

/* Subset of "struct stat". Used by conditional ACL and audit logs. */
struct tomoyo_mini_stat {
	kuid_t uid;
	kgid_t gid;
	ino_t ino;
	umode_t mode;
	dev_t dev;
	dev_t rdev;
};

/* Structure for dumping argv[] and envp[] of "struct linux_binprm". */
struct tomoyo_page_dump {
	struct page *page;    /* Previously dumped page. */
	char *data;           /* Contents of "page". Size is PAGE_SIZE. */
};

/* Structure for attribute checks in addition to pathname checks. */
struct tomoyo_obj_info {
	/*
	 * True if tomoyo_get_attributes() was already called, false
	 * otherwise.
	 */
	bool validate_done;
	/* True if @stat[] is valid. */
	bool stat_valid[TOMOYO_MAX_PATH_STAT];
	/* First pathname. Initialized with { NULL, NULL } if no path. */
	struct path path1;
	/* Second pathname. Initialized with { NULL, NULL } if no path. */
	struct path path2;
	/*
	 * Information on @path1, @path1's parent directory, @path2, @path2's
	 * parent directory.
	 */
	struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT];
	/*
	 * Content of symbolic link to be created. NULL for operations other
	 * than symlink().
	 */
	struct tomoyo_path_info *symlink_target;
};

/* Structure for argv[]. */
struct tomoyo_argv {
	unsigned long index;
	const struct tomoyo_path_info *value;
	bool is_not;
};

/* Structure for envp[]. */
struct tomoyo_envp {
	const struct tomoyo_path_info *name;
	const struct tomoyo_path_info *value;
	bool is_not;
};

/* Structure for execve() operation. */
struct tomoyo_execve {
	struct tomoyo_request_info r;
	struct tomoyo_obj_info obj;
	struct linux_binprm *bprm;
	const struct tomoyo_path_info *transition;
	/* For dumping argv[] and envp[]. */
	struct tomoyo_page_dump dump;
	/* For temporary use. */
	char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
};

/* Structure for entries which follow "struct tomoyo_condition". */
struct tomoyo_condition_element {
	/*
	 * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a
	 * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail
	 * of the array of this struct.
	 */
	u8 left;
	/*
	 * Right hand operand. A "struct tomoyo_number_union" for
	 * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for
	 * TOMOYO_NAME_UNION is attached to the tail of the array of this
	 * struct.
	 */
	u8 right;
	/* Equation operator. True if equals or overlaps, false otherwise. */
	bool equals;
};

/* Structure for optional arguments. */
struct tomoyo_condition {
	struct tomoyo_shared_acl_head head;
	u32 size; /* Memory size allocated for this entry. */
	u16 condc; /* Number of conditions in this struct. */
	u16 numbers_count; /* Number of "struct tomoyo_number_union values". */
	u16 names_count; /* Number of "struct tomoyo_name_union names". */
	u16 argc; /* Number of "struct tomoyo_argv". */
	u16 envc; /* Number of "struct tomoyo_envp". */
	u8 grant_log; /* One of values in "enum tomoyo_grant_log". */
	const struct tomoyo_path_info *transit; /* Maybe NULL. */
	/*
	 * struct tomoyo_condition_element condition[condc];
	 * struct tomoyo_number_union values[numbers_count];
	 * struct tomoyo_name_union names[names_count];
	 * struct tomoyo_argv argv[argc];
	 * struct tomoyo_envp envp[envc];
	 */
};
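/*
 * Illustrative sketch (not part of the original header): the
 * variable-length data documented in the layout comment above starts
 * right after the fixed-size header, so it is reached with pointer
 * arithmetic.  The helper name is hypothetical.
 */
static inline struct tomoyo_condition_element *
tomoyo_example_conditions(const struct tomoyo_condition *cond)
{
	/* First of the condc "struct tomoyo_condition_element" entries. */
	return (struct tomoyo_condition_element *)(cond + 1);
}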
/* Common header for individual entries. */
struct tomoyo_acl_info {
	struct list_head list;
	struct tomoyo_condition *cond; /* Maybe NULL. */
	s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
	u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */
} __packed;

/* Structure for domain information. */
struct tomoyo_domain_info {
	struct list_head list;
	struct list_head acl_info_list;
	/* Name of this domain. Never NULL. */
	const struct tomoyo_path_info *domainname;
	/* Namespace for this domain. Never NULL. */
	struct tomoyo_policy_namespace *ns;
	/* Group numbers to use. */
	unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG];
	u8 profile;      /* Profile number to use. */
	bool is_deleted; /* Delete flag.           */
	bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
	atomic_t users;  /* Number of referring tasks. */
};

/*
 * Structure for "task manual_domain_transition" directive.
 */
struct tomoyo_task_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */
	/* Pointer to domainname. */
	const struct tomoyo_path_info *domainname;
};

/*
 * Structure for "file execute", "file read", "file write", "file append",
 * "file unlink", "file getattr", "file rmdir", "file truncate",
 * "file symlink", "file chroot" and "file unmount" directive.
 */
struct tomoyo_path_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */
	u16 perm; /* Bitmask of values in "enum tomoyo_path_acl_index". */
	struct tomoyo_name_union name;
};

/*
 * Structure for "file create", "file mkdir", "file mkfifo", "file mksock",
 * "file ioctl", "file chmod", "file chown" and "file chgrp" directive.
 */
struct tomoyo_path_number_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */
	/* Bitmask of values in "enum tomoyo_path_number_acl_index". */
	u8 perm;
	struct tomoyo_name_union name;
	struct tomoyo_number_union number;
};

/* Structure for "file mkblock" and "file mkchar" directive. */
struct tomoyo_mkdev_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */
	u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */
	struct tomoyo_name_union name;
	struct tomoyo_number_union mode;
	struct tomoyo_number_union major;
	struct tomoyo_number_union minor;
};

/*
 * Structure for "file rename", "file link" and "file pivot_root" directive.
 */
struct tomoyo_path2_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */
	u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */
	struct tomoyo_name_union name1;
	struct tomoyo_name_union name2;
};

/* Structure for "file mount" directive. */
struct tomoyo_mount_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */
	struct tomoyo_name_union dev_name;
	struct tomoyo_name_union dir_name;
	struct tomoyo_name_union fs_type;
	struct tomoyo_number_union flags;
};

/* Structure for "misc env" directive in domain policy. */
struct tomoyo_env_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_ENV_ACL */
	const struct tomoyo_path_info *env; /* environment variable */
};

/* Structure for "network inet" directive. */
struct tomoyo_inet_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */
	u8 protocol;
	u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
	struct tomoyo_ipaddr_union address;
	struct tomoyo_number_union port;
};

/* Structure for "network unix" directive. */
struct tomoyo_unix_acl {
	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */
	u8 protocol;
	u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
	struct tomoyo_name_union name;
};

/* Structure for holding a line from /sys/kernel/security/tomoyo/ interface. */
struct tomoyo_acl_param {
	char *data;
	struct list_head *list;
	struct tomoyo_policy_namespace *ns;
	bool is_delete;
};

#define TOMOYO_MAX_IO_READ_QUEUE 64

/*
 * Structure for reading/writing policy via /sys/kernel/security/tomoyo
 * interfaces.
 */
struct tomoyo_io_buffer {
	void (*read)(struct tomoyo_io_buffer *head);
	int (*write)(struct tomoyo_io_buffer *head);
	__poll_t (*poll)(struct file *file, poll_table *wait);
	/* Exclusive lock for this structure. */
	struct mutex io_sem;
	char __user *read_user_buf;
	size_t read_user_buf_avail;
	struct {
		struct list_head *ns;
		struct list_head *domain;
		struct list_head *group;
		struct list_head *acl;
		size_t avail;
		unsigned int step;
		unsigned int query_index;
		u16 index;
		u16 cond_index;
		u8 acl_group_index;
		u8 cond_step;
		u8 bit;
		u8 w_pos;
		bool eof;
		bool print_this_domain_only;
		bool print_transition_related_only;
		bool print_cond_part;
		const char *w[TOMOYO_MAX_IO_READ_QUEUE];
	} r;
	struct {
		struct tomoyo_policy_namespace *ns;
		/* The position currently writing to. */
		struct tomoyo_domain_info *domain;
		/* Bytes available for writing. */
		size_t avail;
		bool is_delete;
	} w;
	/* Buffer for reading. */
	char *read_buf;
	/* Size of read buffer. */
	size_t readbuf_size;
	/* Buffer for writing. */
	char *write_buf;
	/* Size of write buffer. */
	size_t writebuf_size;
	/* Type of this interface. */
	enum tomoyo_securityfs_interface_index type;
	/* Users counter protected by tomoyo_io_buffer_list_lock. */
	u8 users;
	/* List for telling GC not to kfree() elements. */
	struct list_head list;
};
/*
 * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/
 * "no_keep_domain" keyword.
 */
struct tomoyo_transition_control {
	struct tomoyo_acl_head head;
	u8 type; /* One of values in "enum tomoyo_transition_type". */
	/* True if the domainname is tomoyo_get_last_name(). */
	bool is_last_name;
	const struct tomoyo_path_info *domainname; /* Maybe NULL */
	const struct tomoyo_path_info *program;    /* Maybe NULL */
};

/* Structure for "aggregator" keyword. */
struct tomoyo_aggregator {
	struct tomoyo_acl_head head;
	const struct tomoyo_path_info *original_name;
	const struct tomoyo_path_info *aggregated_name;
};

/* Structure for policy manager. */
struct tomoyo_manager {
	struct tomoyo_acl_head head;
	/* A path to program or a domainname. */
	const struct tomoyo_path_info *manager;
};

struct tomoyo_preference {
	unsigned int learning_max_entry;
	bool enforcing_verbose;
	bool learning_verbose;
	bool permissive_verbose;
};

/* Structure for /sys/kernel/security/tomoyo/profile interface. */
struct tomoyo_profile {
	const struct tomoyo_path_info *comment;
	struct tomoyo_preference *learning;
	struct tomoyo_preference *permissive;
	struct tomoyo_preference *enforcing;
	struct tomoyo_preference preference;
	u8 default_config;
	u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX];
	unsigned int pref[TOMOYO_MAX_PREF];
};

/* Structure for representing YYYY/MM/DD hh:mm:ss. */
struct tomoyo_time {
	u16 year;
	u8 month;
	u8 day;
	u8 hour;
	u8 min;
	u8 sec;
};

/* Structure for policy namespace. */
struct tomoyo_policy_namespace {
	/* Profile table. Memory is allocated as needed. */
	struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES];
	/* List of "struct tomoyo_group". */
	struct list_head group_list[TOMOYO_MAX_GROUP];
	/* List of policy. */
	struct list_head policy_list[TOMOYO_MAX_POLICY];
	/* The global ACL referred by "use_group" keyword. */
	struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
	/* List for connecting to tomoyo_namespace_list list. */
	struct list_head namespace_list;
	/* Profile version. Currently only 20150505 is defined. */
	unsigned int profile_version;
	/* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>"). */
	const char *name;
};

/* Structure for "struct task_struct"->security. */
struct tomoyo_task {
	struct tomoyo_domain_info *domain_info;
	struct tomoyo_domain_info *old_domain_info;
};

/********** Function prototypes. **********/
bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address,
				  const struct tomoyo_group *group);
bool tomoyo_compare_number_union(const unsigned long value,
				 const struct tomoyo_number_union *ptr);
bool tomoyo_condition(struct tomoyo_request_info *r,
		      const struct tomoyo_condition *cond);
bool tomoyo_correct_domain(const unsigned char *domainname);
bool tomoyo_correct_path(const char *filename);
bool tomoyo_correct_word(const char *string);
bool tomoyo_domain_def(const unsigned char *buffer);
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r);
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
		      struct tomoyo_page_dump *dump);
bool tomoyo_memory_ok(void *ptr);
bool tomoyo_number_matches_group(const unsigned long min,
				 const unsigned long max,
				 const struct tomoyo_group *group);
bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param,
			       struct tomoyo_ipaddr_union *ptr);
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
			     struct tomoyo_name_union *ptr);
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
			       struct tomoyo_number_union *ptr);
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
				 const struct tomoyo_path_info *pattern);
bool tomoyo_permstr(const char *string, const char *keyword);
bool tomoyo_str_starts(char **src, const char *find);
char *tomoyo_encode(const char *str);
char *tomoyo_encode2(const char *str, int str_len);
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
		      va_list args) __printf(3, 0);
char *tomoyo_read_token(struct tomoyo_acl_param *param);
char *tomoyo_realpath_from_path(const struct path *path);
char *tomoyo_realpath_nofollow(const char *pathname);
const char *tomoyo_get_exe(void);
const struct tomoyo_path_info *tomoyo_compare_name_union
(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr);
const struct tomoyo_path_info *tomoyo_get_domainname
(struct tomoyo_acl_param *param);
const struct tomoyo_path_info *tomoyo_get_name(const char *name);
const struct tomoyo_path_info *tomoyo_path_matches_group
(const struct tomoyo_path_info *pathname, const struct tomoyo_group *group);
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
				 const struct path *path, const int flag);
void tomoyo_close_control(struct tomoyo_io_buffer *head);
int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env);
int tomoyo_execute_permission(struct tomoyo_request_info *r,
			      const struct tomoyo_path_info *filename);
int tomoyo_find_next_domain(struct linux_binprm *bprm);
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
		    const u8 index);
int tomoyo_init_request_info(struct tomoyo_request_info *r,
			     struct tomoyo_domain_info *domain,
			     const u8 index);
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
		      const unsigned int mode, unsigned int dev);
int tomoyo_mount_permission(const char *dev_name, const struct path *path,
			    const char *type, unsigned long flags,
			    void *data_page);
int tomoyo_open_control(const u8 type, struct file *file);
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
		      const struct path *path2);
int tomoyo_path_number_perm(const u8 operation, const struct path *path,
			    unsigned long number);
int tomoyo_path_perm(const u8 operation, const struct path *path,
		     const char *target);
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait);
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait);
int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr,
				  int addr_len);
int tomoyo_socket_connect_permission(struct socket *sock,
				     struct sockaddr *addr, int addr_len);
int tomoyo_socket_listen_permission(struct socket *sock);
int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg,
				     int size);
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
	__printf(2, 3);
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
			 struct tomoyo_acl_param *param,
			 bool (*check_duplicate)
			 (const struct tomoyo_acl_info *,
			  const struct tomoyo_acl_info *),
			 bool (*merge_duplicate)
			 (struct tomoyo_acl_info *, struct tomoyo_acl_info *,
			  const bool));
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
			 struct tomoyo_acl_param *param,
			 bool (*check_duplicate)
			 (const struct tomoyo_acl_head *,
			  const struct tomoyo_acl_head *));
int tomoyo_write_aggregator(struct tomoyo_acl_param *param);
int tomoyo_write_file(struct tomoyo_acl_param *param);
int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type);
int tomoyo_write_misc(struct tomoyo_acl_param *param);
int tomoyo_write_inet_network(struct tomoyo_acl_param *param);
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
				    const u8 type);
int tomoyo_write_unix_network(struct tomoyo_acl_param *param);
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
			    const int buffer_len);
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
			     const char __user *buffer, const int buffer_len);
struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param);
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
						const bool transit);
struct tomoyo_domain_info *tomoyo_domain(void);
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
				      const u8 idx);
struct tomoyo_policy_namespace *tomoyo_assign_namespace
(const char *domainname);
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
				      const u8 profile);
u8 tomoyo_parse_ulong(unsigned long *result, char **str);
void *tomoyo_commit_ok(void *data, const unsigned int size);
void __init tomoyo_load_builtin_policy(void);
void __init tomoyo_mm_init(void);
void tomoyo_check_acl(struct tomoyo_request_info *r,
		      bool (*check_entry)(struct tomoyo_request_info *,
					  const struct tomoyo_acl_info *));
void tomoyo_check_profile(void);
void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp);
void tomoyo_del_condition(struct list_head *element);
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr);
void tomoyo_get_attributes(struct tomoyo_obj_info *obj);
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns);
void tomoyo_load_policy(const char *filename);
void tomoyo_normalize_line(unsigned char *buffer);
void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register);
void tomoyo_print_ip(char *buf, const unsigned int size,
		     const struct tomoyo_ipaddr_union *ptr);
void tomoyo_print_ulong(char *buffer, const int buffer_len,
			const unsigned long value, const u8 type);
void tomoyo_put_name_union(struct tomoyo_name_union *ptr);
void tomoyo_put_number_union(struct tomoyo_number_union *ptr);
void tomoyo_read_log(struct tomoyo_io_buffer *head);
void tomoyo_update_stat(const u8 index);
void tomoyo_warn_oom(const char *function);
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
	__printf(2, 3);
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
		       va_list args) __printf(3, 0);

/********** External variable definitions. **********/

extern bool tomoyo_policy_loaded;
extern int tomoyo_enabled;
extern const char * const tomoyo_condition_keyword
[TOMOYO_MAX_CONDITION_KEYWORD];
extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
					      + TOMOYO_MAX_MAC_CATEGORY_INDEX];
extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE];
extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION];
extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX];
extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION];
extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX];
extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION];
extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION];
extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION];
extern struct list_head tomoyo_condition_list;
extern struct list_head tomoyo_domain_list;
extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
extern struct list_head tomoyo_namespace_list;
extern struct mutex tomoyo_policy_lock;
extern struct srcu_struct tomoyo_ss;
extern struct tomoyo_domain_info tomoyo_kernel_domain;
extern struct tomoyo_policy_namespace tomoyo_kernel_namespace;
extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT];
extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT];
extern struct lsm_blob_sizes tomoyo_blob_sizes;

/********** Inlined functions. **********/

/**
 * tomoyo_read_lock - Take lock for protecting policy.
 *
 * Returns index number for tomoyo_read_unlock().
 */
static inline int tomoyo_read_lock(void)
{
	return srcu_read_lock(&tomoyo_ss);
}

/**
 * tomoyo_read_unlock - Release lock for protecting policy.
 *
 * @idx: Index number returned by tomoyo_read_lock().
 *
 * Returns nothing.
 */
static inline void tomoyo_read_unlock(int idx)
{
	srcu_read_unlock(&tomoyo_ss, idx);
}
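/*
 * Illustrative sketch (not part of the original header): the standard
 * SRCU reader pattern for TOMOYO's policy lists.  The function name is
 * hypothetical.
 */
static void tomoyo_example_reader(void)
{
	const int idx = tomoyo_read_lock();

	/* ... safely traverse tomoyo_domain_list and friends here ... */
	tomoyo_read_unlock(idx);
}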
/**
 * tomoyo_sys_getppid - Copy of getppid().
 *
 * Returns parent process's PID.
 *
 * Alpha does not have getppid() defined. To be able to build this module on
 * Alpha, I have to copy getppid() from kernel/timer.c.
 */
static inline pid_t tomoyo_sys_getppid(void)
{
	pid_t pid;

	rcu_read_lock();
	pid = task_tgid_vnr(rcu_dereference(current->real_parent));
	rcu_read_unlock();
	return pid;
}

/**
 * tomoyo_sys_getpid - Copy of getpid().
 *
 * Returns current thread's PID.
 *
 * Alpha does not have getpid() defined. To be able to build this module on
 * Alpha, I have to copy getpid() from kernel/timer.c.
 */
static inline pid_t tomoyo_sys_getpid(void)
{
	return task_tgid_vnr(current);
}

/**
 * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure.
 *
 * @a: Pointer to "struct tomoyo_path_info".
 * @b: Pointer to "struct tomoyo_path_info".
 *
 * Returns true if @a != @b, false otherwise (like strcmp(), a non-zero
 * result means "different").
 */
static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a,
				  const struct tomoyo_path_info *b)
{
	return a->hash != b->hash || strcmp(a->name, b->name);
}

/**
 * tomoyo_put_name - Drop reference on "struct tomoyo_name".
 *
 * @name: Pointer to "struct tomoyo_path_info". Maybe NULL.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_name(const struct tomoyo_path_info *name)
{
	if (name) {
		struct tomoyo_name *ptr =
			container_of(name, typeof(*ptr), entry);

		atomic_dec(&ptr->head.users);
	}
}

/**
 * tomoyo_put_condition - Drop reference on "struct tomoyo_condition".
 *
 * @cond: Pointer to "struct tomoyo_condition". Maybe NULL.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_condition(struct tomoyo_condition *cond)
{
	if (cond)
		atomic_dec(&cond->head.users);
}

/**
 * tomoyo_put_group - Drop reference on "struct tomoyo_group".
 *
 * @group: Pointer to "struct tomoyo_group". Maybe NULL.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_group(struct tomoyo_group *group)
{
	if (group)
		atomic_dec(&group->head.users);
}

/**
 * tomoyo_task - Get "struct tomoyo_task" for specified thread.
 *
 * @task: Pointer to "struct task_struct".
 *
 * Returns pointer to "struct tomoyo_task" for specified thread.
 */
static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
{
	return task->security + tomoyo_blob_sizes.lbs_task;
}

/**
 * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry.
 *
 * @a: Pointer to "struct tomoyo_name_union".
 * @b: Pointer to "struct tomoyo_name_union".
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_name_union
(const struct tomoyo_name_union *a, const struct tomoyo_name_union *b)
{
	return a->filename == b->filename && a->group == b->group;
}

/**
 * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry.
 *
 * @a: Pointer to "struct tomoyo_number_union".
 * @b: Pointer to "struct tomoyo_number_union".
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_number_union
(const struct tomoyo_number_union *a, const struct tomoyo_number_union *b)
{
	return a->values[0] == b->values[0] && a->values[1] == b->values[1] &&
		a->group == b->group && a->value_type[0] == b->value_type[0] &&
		a->value_type[1] == b->value_type[1];
}

/**
 * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry.
 *
 * @a: Pointer to "struct tomoyo_ipaddr_union".
 * @b: Pointer to "struct tomoyo_ipaddr_union".
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_ipaddr_union
(const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b)
{
	return !memcmp(a->ip, b->ip, sizeof(a->ip)) && a->group == b->group &&
		a->is_ipv6 == b->is_ipv6;
}

/**
 * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" for current thread.
 */
static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void)
{
	return tomoyo_domain()->ns;
}

/**
 * list_for_each_cookie - iterate over a list with cookie.
 * @pos:	the &struct list_head to use as a loop cursor.
 * @head:	the head for your list.
 */
#define list_for_each_cookie(pos, head)					\
	if (!pos)							\
		pos = srcu_dereference((head)->next, &tomoyo_ss);	\
	for ( ; pos != (head); pos = srcu_dereference(pos->next, &tomoyo_ss))

#endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */
// SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_taprio.c Time Aware Priority Scheduler * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> * */ #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/tcp.h> #define TAPRIO_STAT_NOT_SET (~0ULL) #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); static struct static_key_false taprio_have_broken_mqprio; static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { /* Durations between this GCL entry and the GCL entry where the * respective traffic class gate closes */ u64 gate_duration[TC_MAX_QUEUE]; atomic_t budget[TC_MAX_QUEUE]; /* The qdisc makes some effort so that no packet leaves * after this time */ ktime_t gate_close_time[TC_MAX_QUEUE]; struct list_head list; /* Used to calculate when to advance the schedule */ ktime_t end_time; ktime_t next_txtime; int index; u32 gate_mask; u32 interval; u8 command; }; struct sched_gate_list { /* Longest non-zero contiguous gate durations per traffic class, * or 0 if a traffic class gate never opens during the schedule. 
*/ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. 
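 *
 * Worked example (editor's addition, hypothetical values): take a 30 us
 * schedule of three 10 us entries with gate masks 0x1, 0x3 and 0x2.
 * Walking from entry 0, TC0 stays open across entries 0 and 1, so entry 0
 * gets gate_duration[0] = 20 us; walking from entry 1, TC1 stays open
 * across entries 1 and 2, so entry 1 gets gate_duration[1] = 20 us. The
 * loop below then records 20 us as max_open_gate_duration[] for both TCs.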
*/ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. 
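 *
 * Worked example (editor's addition, hypothetical numbers): at 1 Gbps,
 * picos_per_byte is 8000, so a TC whose longest open gate is 10 us fits
 * at most duration_to_length(q, 10000) = 10000 * 1000 / 8000 = 1250
 * bytes on the wire. Minus 14 bytes of Ethernet header, that is a dynamic
 * queueMaxSDU of 1236 bytes; a user request of max-sdu 1500 on that TC
 * would therefore be clamped to min(1236, 1500) = 1236, with
 * max_frm_len = 1250.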
*/ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to the next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. 
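 *
 * Editor's note (hypothetical numbers): with a 1 ms cycle, if this
 * entry's interval [100 us, 110 us) has already passed at the lookup
 * time, the same interval recurs one cycle later at [1100 us, 1110 us),
 * which is exactly what the ktime_add_ns(..., cycle) lines just below
 * record as the fallback interval.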
*/ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. 
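 *
 * Concrete illustration of case 2b (editor's addition, hypothetical
 * timeline): let TC0's window be [0 us, 100 us) of every 1 ms cycle and
 * let a packet need 12 us of wire time. If next_txtime reads 95 us, the
 * transmission would end at 107 us, past the window close at 100 us, so
 * the packet is scheduled into the next window and gets txtime = 1000 us.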
*/ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of the current entry to the next time its * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. 
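 *
 * Editor's note: the SOCK_TXTIME branch below serves senders that opted
 * in from user space, roughly (illustrative snippet, not from this file):
 *
 *	struct sock_txtime cfg = { .clockid = CLOCK_TAI };
 *	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
 *
 * after which the requested per-packet launch time arrives through the
 * SCM_TXTIME control message and is checked by is_valid_interval().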
*/ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. 
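 *
 * Editor's example (hypothetical numbers): a 64 KB GSO super-packet at
 * 1 Gbps needs roughly 64000 * 8 ns = 512 us of wire time, longer than a
 * typical gate interval of tens of microseconds, while each resulting
 * ~1500 byte segment needs only ~12 us and can be validated against
 * queueMaxSDU individually.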
*/ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. 
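 *
 * Editor's example (hypothetical numbers): at 1 Gbps a 1500 byte frame
 * needs ~12 us of wire time. If the peeked skb's gate closes in 10 us,
 * the guard check above leaves it queued; likewise, once fewer than 1500
 * bytes of budget remain for its TC in this entry, the budget check below
 * refuses the dequeue until the schedule advances.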
*/ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open; this is in * accordance with IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case: the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case: if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules is pre-calculated during the * schedule initialization. 
*/ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. 
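 *
 * Editor's note (hypothetical figures): ETH_ZLEN is 60 bytes, so at
 * 1 Gbps min_duration = length_to_duration(q, 60) = 60 * 8000 / 1000 =
 * 480 ns, and any schedule entry shorter than that is rejected below.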
*/ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); return -EINVAL; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt) { if (!dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } return 0; } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. 
*/ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. 
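 *
 * Editor's example (hypothetical times): if the timer is currently armed
 * for the running schedule's next advance at t = 7 s but the new admin
 * schedule starts at t = 5 s, the min_t() below re-arms the timer for
 * t = 5 s so the schedule switch is not missed.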
*/ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); oper = rtnl_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rtnl_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * user looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. 
*/ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device 
*dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and is hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct kernel_ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * them to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. 
*/ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; u32 taprio_flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); /* The semantics of the 'flags' argument in relation to 'change()' * requests are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. 
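 *
 * Editor's note: from user space this corresponds to e.g. "tc qdisc
 * replace dev eth0 parent root taprio ... flags 0x1" (txtime-assist) or
 * "flags 0x2" (full offload); the command line is illustrative, the
 * attribute itself is TCA_TAPRIO_ATTR_FLAGS.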
*/ taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); return -EINVAL; } if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ taprio_set_picos_per_byte(dev, q); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } /* Not going to race against advance_sched(), but still */ admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); admin = 
rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). 
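 *
 * ("Static" here means one of the fixed CLOCK_* constants such as
 * CLOCK_TAI or CLOCK_REALTIME; dynamic, file-descriptor based posix
 * clocks cannot be used.)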
*/ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. 
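 *
 * Put differently (illustrative): software mode only swaps the pointer
 * in q->qdiscs[], while offload mode additionally re-grafts the netdev
 * TX queue, taking a reference on the new child and dropping the old
 * child's extra reference.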
*/ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto 
err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; rcu_read_lock(); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error_rcu; if (oper && dump_schedule(skb, oper)) goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error_rcu: rcu_read_unlock(); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, 
.select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc");
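/*
 * Usage sketch (not part of the kernel sources): a typical software-mode
 * taprio configuration issued from user space with tc(8). The device
 * name, base-time and intervals below are illustrative only.
 *
 *	tc qdisc replace dev eth0 parent root handle 100 taprio \
 *		num_tc 3 \
 *		map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \
 *		queues 1@0 1@1 2@2 \
 *		base-time 1528743495910289987 \
 *		sched-entry S 01 300000 \
 *		sched-entry S 02 300000 \
 *		sched-entry S 04 400000 \
 *		clockid CLOCK_TAI
 *
 * Each sched-entry line becomes one struct sched_entry parsed above:
 * command 'S' (set gate states), a traffic-class gate mask and an
 * interval in nanoseconds.
 */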
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
 * Copyright (C) 2005-2006 Thomas Gleixner
 *
 * This file contains driver APIs to the irq subsystem.
*/ #define pr_fmt(fmt) "genirq: " fmt #include <linux/irq.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> #include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> #include "internals.h" #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) DEFINE_STATIC_KEY_FALSE(force_irqthreads_key); static int __init setup_forced_irqthreads(char *arg) { static_branch_enable(&force_irqthreads_key); return 0; } early_param("threadirqs", setup_forced_irqthreads); #endif static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { unsigned long flags; /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); inprogress = irqd_irq_inprogress(&desc->irq_data); /* * If requested and supported, check at the chip whether it * is in flight at the hardware level, i.e. already pending * in a CPU and waiting for service and acknowledge. */ if (!inprogress && sync_chip) { /* * Ignore the return code. inprogress is only updated * when the chip supports it. */ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ } while (inprogress); } /** * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending hard IRQ handlers for this * interrupt to complete before returning. If you use this * function while holding a resource the IRQ handler may need you * will deadlock. It does not take associated threaded handlers * into account. * * Do not use this for shutdown scenarios where you must be sure * that all parts (hardirq and threaded handler) have completed. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. * * It does not check whether there is an interrupt in flight at the * hardware level, but not serviced yet, as this might deadlock when * called with interrupts disabled and the target CPU of the interrupt * is the current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } return true; } EXPORT_SYMBOL(synchronize_hardirq); static void __synchronize_irq(struct irq_desc *desc) { __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is running. Now verify that no * threaded handlers are active. */ wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. 
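 *
 * A minimal teardown sketch (hypothetical driver code; the register
 * name and structure fields are assumptions):
 *
 *	writel(0, dev->regs + MYDEV_IRQ_ENABLE);  // quiesce the device
 *	synchronize_irq(dev->irq);                // wait out in-flight handlers
 *	kfree(dev->dma_buf);                      // now safe to release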
* * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. * * It optionally makes sure (when the irq chip supports that method) * that the interrupt is not pending in any CPU and waiting for * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); #ifdef CONFIG_SMP cpumask_var_t irq_default_affinity; static bool __irq_can_set_affinity(struct irq_desc *desc) { if (!desc || !irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return false; return true; } /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) { return __irq_can_set_affinity(irq_to_desc(irq)); } /** * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space * @irq: Interrupt to check * * Like irq_can_set_affinity() above, but additionally checks for the * AFFINITY_MANAGED flag. */ bool irq_can_set_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return __irq_can_set_affinity(desc) && !irqd_affinity_is_managed(&desc->irq_data); } /** * irq_set_thread_affinity - Notify irq threads to adjust affinity * @desc: irq descriptor which has affinity changed * * We just set IRQTF_AFFINITY and delegate the affinity setting * to the interrupt thread itself. We can not call * set_cpus_allowed_ptr() here as we hold desc->lock and this * code can be called from hard interrupt context. */ static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; for_each_action_of_desc(desc, action) { if (action->thread) { set_bit(IRQTF_AFFINITY, &action->thread_flags); wake_up_process(action->thread); } if (action->secondary && action->secondary->thread) { set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags); wake_up_process(action->secondary->thread); } } } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); if (!cpumask_empty(m)) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); } #else static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif static DEFINE_PER_CPU(struct cpumask, __tmp_mask); int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; if (!chip || !chip->irq_set_affinity) return -EINVAL; /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with * a housekeeping CPU. If so, then remove the isolated CPUs from * the mask and just keep the housekeeping CPU(s). This prevents * the affinity setter from routing the interrupt to an isolated * CPU to avoid that I/O submitted from a housekeeping CPU causes * interrupts on an isolated one. * * If the masks do not intersect or include online CPU(s) then * keep the requested mask. The isolated target CPUs are only * receiving interrupts when the I/O operation was submitted * directly from them. 
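 *
 * Worked example (illustrative): requested mask 0-3, housekeeping
 * CPUs 0-1. The interrupt is programmed to CPUs 0-1 only; if neither
 * CPU 0 nor CPU 1 is online, the full 0-3 request is kept instead.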
* * If all housekeeping CPUs in the affinity mask are offline, the * interrupt will be migrated by the CPU hotplug code once a * housekeeping CPU which belongs to the affinity mask comes * online. */ if (irqd_affinity_is_managed(data) && housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask; hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); cpumask_and(tmp_mask, mask, hk_mask); if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else prog_mask = tmp_mask; } else { prog_mask = mask; } /* * Make sure we only provide online CPUs to the irqchip, * unless we are being asked to force the affinity (in which * case we do as we are told). */ cpumask_and(tmp_mask, prog_mask, cpu_online_mask); if (!force && !cpumask_empty(tmp_mask)) ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); ret = 0; } return ret; } #ifdef CONFIG_GENERIC_PENDING_IRQ static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { struct irq_desc *desc = irq_data_to_desc(data); irqd_set_move_pending(data); irq_copy_pending(desc, dest); return 0; } #else static inline int irq_set_affinity_pending(struct irq_data *data, const struct cpumask *dest) { return -EBUSY; } #endif static int irq_try_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force) { int ret = irq_do_set_affinity(data, dest, force); /* * In case that the underlying vector management is busy and the * architecture supports the generic pending mechanism then utilize * this to avoid returning an error to user space. */ if (ret == -EBUSY && !force) ret = irq_set_affinity_pending(data, dest); return ret; } static bool irq_set_affinity_deactivated(struct irq_data *data, const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); /* * Handle irq chips which can handle affinity only in activated * state correctly * * If the interrupt is not yet activated, just store the affinity * mask and do not call the chip driver at all. On activation the * driver has to make sure anyway that the interrupt is in a * usable state so startup works. 
*/ if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data) || !irqd_affinity_on_activate(data)) return false; cpumask_copy(desc->irq_common_data.affinity, mask); irq_data_update_effective_affinity(data, mask); irqd_set(data, IRQD_AFFINITY_SET); return true; } int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; if (!chip || !chip->irq_set_affinity) return -EINVAL; if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); } if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); if (!schedule_work(&desc->affinity_notify->work)) { /* Work was already scheduled, drop our extra ref */ kref_put(&desc->affinity_notify->kref, desc->affinity_notify->release); } } irqd_set(data, IRQD_AFFINITY_SET); return ret; } /** * irq_update_affinity_desc - Update affinity management for an interrupt * @irq: The interrupt number to update * @affinity: Pointer to the affinity descriptor * * This interface can be used to configure the affinity management of * interrupts which have been allocated already. * * There are certain limitations on when it may be used - attempts to use it * for when the kernel is configured for generic IRQ reservation mode (in * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with * managed/non-managed interrupt accounting. In addition, attempts to use it on * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { struct irq_desc *desc; unsigned long flags; bool activated; int ret = 0; /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. */ if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return -EINVAL; /* Requires the interrupt to be shut down */ if (irqd_is_started(&desc->irq_data)) { ret = -EBUSY; goto out_unlock; } /* Interrupts which are already managed cannot be modified */ if (irqd_affinity_is_managed(&desc->irq_data)) { ret = -EBUSY; goto out_unlock; } /* * Deactivate the interrupt. That's required to undo * anything an earlier activation has established. 
*/ activated = irqd_is_activated(&desc->irq_data); if (activated) irq_domain_deactivate_irq(&desc->irq_data); if (affinity->is_managed) { irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); } cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); /* Restore the activation state */ if (activated) irq_domain_activate_irq(&desc->irq_data, false); out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; if (!desc) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Fails if cpumask does not contain an online CPU */ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, false); } EXPORT_SYMBOL_GPL(irq_set_affinity); /** * irq_force_affinity - Force the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * * Same as irq_set_affinity, but without checking the mask against * online cpus. * * Solely for low level cpu hotplug code, where we need to make per * cpu interrupts affine before the cpu becomes online. */ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return __irq_set_affinity(irq, cpumask, true); } EXPORT_SYMBOL_GPL(irq_force_affinity); int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); if (m && setaffinity) __irq_set_affinity(irq, m, false); return 0; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; unsigned long flags; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; raw_spin_lock_irqsave(&desc->lock, flags); if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_common_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); free_cpumask_var(cpumask); out: kref_put(&notify->kref, notify->release); } /** * irq_set_affinity_notifier - control notification of IRQ affinity changes * @irq: Interrupt for which to enable/disable notification * @notify: Context for notification, or %NULL to disable * notification. Function pointers must be initialised; * the other fields will be initialised by this function. * * Must be called in process context. Notification may only be enabled * after the IRQ is allocated and must be disabled before the IRQ is * freed using free_irq(). 
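 *
 * Registration sketch (hypothetical callbacks; only the two function
 * pointers must be initialised by the caller):
 *
 *	static void my_notify(struct irq_affinity_notify *notify,
 *			      const cpumask_t *mask)
 *	{
 *		// re-steer per-CPU state to @mask
 *	}
 *
 *	static void my_release(struct kref *ref)
 *	{
 *		// drop the reference of the embedding object
 *	}
 *
 *	notify.notify = my_notify;
 *	notify.release = my_release;
 *	irq_set_affinity_notifier(irq, &notify);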
*/ int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; unsigned long flags; /* The release function is promised process context */ might_sleep(); if (!desc || irq_is_nmi(desc)) return -EINVAL; /* Complete initialisation of *notify */ if (notify) { notify->irq = irq; kref_init(&notify->kref); INIT_WORK(&notify->work, irq_affinity_notify); } raw_spin_lock_irqsave(&desc->lock, flags); old_notify = desc->affinity_notify; desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { if (cancel_work_sync(&old_notify->work)) { /* Pending work had a ref, put that one too */ kref_put(&old_notify->kref, old_notify->release); } kref_put(&old_notify->kref, old_notify->release); } return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. */ int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int ret, node = irq_desc_get_node(desc); static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; raw_spin_lock(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. */ if (irqd_affinity_is_managed(&desc->irq_data) || irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_common_data.affinity, cpu_online_mask)) set = desc->irq_common_data.affinity; else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } cpumask_and(&mask, cpu_online_mask, set); if (cpumask_empty(&mask)) cpumask_copy(&mask, cpu_online_mask); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } ret = irq_do_set_affinity(&desc->irq_data, &mask, false); raw_spin_unlock(&mask_lock); return ret; } #else /* Wrapper for ALPHA specific affinity selector magic */ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* CONFIG_AUTO_IRQ_AFFINITY */ #endif /* CONFIG_SMP */ /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt * @irq: interrupt number to set affinity * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU * specific data for percpu_devid interrupts * * This function uses the vCPU specific data to set the vCPU * affinity for an irq. The vCPU specific data is passed from * outside, such as KVM. One example code path is as below: * KVM -> IOMMU -> irq_set_vcpu_affinity(). 
*/ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); struct irq_data *data; struct irq_chip *chip; int ret = -ENOSYS; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); do { chip = irq_data_get_irq_chip(data); if (chip && chip->irq_set_vcpu_affinity) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) ret = chip->irq_set_vcpu_affinity(data, vcpu_info); irq_put_desc_unlock(desc, flags); return ret; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); void __disable_irq(struct irq_desc *desc) { if (!desc->depth++) irq_disable(desc); } static int __disable_irq_nosync(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; __disable_irq(desc); irq_put_desc_busunlock(desc, flags); return 0; } /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and Enables are * nested. * Unlike disable_irq(), this function does not ensure existing * instances of the IRQ handler have completed before returning. * * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); /** * disable_irq - disable an irq and wait for completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are * nested. * This function waits for any pending IRQ handlers for this interrupt * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * * Can only be called from preemptible code as it might sleep when * an interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) { might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); /** * disable_hardirq - disables an irq and waits for hardirq completion * @irq: Interrupt to disable * * Disable the selected interrupt line. Enables and Disables are * nested. * This function waits for any pending hard IRQ handlers for this * interrupt to complete before returning. If you use this function while * holding a resource the hard IRQ handler may need you will deadlock. * * When used to optimistically disable an interrupt from atomic context * the return value must be checked. * * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** * disable_nmi_nosync - disable an nmi without waiting * @irq: Interrupt to disable * * Disable the selected interrupt line. Disables and enables are * nested. * The interrupt to disable must have been requested through request_nmi. * Unlike disable_nmi(), this function does not ensure existing * instances of the IRQ handler have completed before returning. 
*/ void disable_nmi_nosync(unsigned int irq) { disable_irq_nosync(irq); } void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { case 0: err_out: WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq_desc_get_irq(desc)); break; case 1: { if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); /* * Call irq_startup() not irq_enable() here because the * interrupt might be marked NOAUTOEN so irq_startup() * needs to be invoked when it gets enabled the first time. * This is also required when __enable_irq() is invoked for * a managed and shutdown interrupt from the S3 resume * path. * * If it was already started up, then irq_startup() will * invoke irq_enable() under the hood. */ irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); break; } default: desc->depth--; } } /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable * * Undoes the effect of one call to disable_irq(). If this * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. * * This function may be called from IRQ context only when * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; if (WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; __enable_irq(desc); out: irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); /** * enable_nmi - enable handling of an nmi * @irq: Interrupt to enable * * The interrupt to enable must have been requested through request_nmi. * Undoes the effect of one call to disable_nmi(). If this * matches the last disable, processing of interrupts on this * IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { enable_irq(irq); } static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) return 0; if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } /** * irq_set_irq_wake - control irq power management wakeup * @irq: interrupt to control * @on: enable/disable power management wakeup * * Enable/disable power management wakeup mode, which is * disabled by default. Enables and disables must match, * just as they match for non-wakeup mode support. * * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". * * Note: irq enable/disable state is completely orthogonal * to the enable/disable state of irq wake. An irq can be * disabled with disable_irq() and still wake the system as * long as the irq has wake enabled. If this does not hold, * then the underlying irq chip and the related driver need * to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) return -EINVAL; /* Don't use NMIs as wake up interrupts please */ if (irq_is_nmi(desc)) { ret = -EINVAL; goto out_unlock; } /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. 
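 *
 * Typical pairing in a driver (sketch; the private structure and its
 * fields are assumptions):
 *
 *	// suspend path
 *	if (device_may_wakeup(dev))
 *		irq_set_irq_wake(priv->irq, 1);
 *	// resume path
 *	if (device_may_wakeup(dev))
 *		irq_set_irq_wake(priv->irq, 0);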
*/ if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 0; else irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) desc->wake_depth = 1; else irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. */ int can_request_irq(unsigned int irq, unsigned long irqflags) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); int canrequest = 0; if (!desc) return 0; if (irq_settings_can_request(desc)) { if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) canrequest = 1; } irq_put_desc_unlock(desc, flags); return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) { struct irq_chip *chip = desc->irq_data.chip; int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ pr_debug("No set_type function for IRQ %d (%s)\n", irq_desc_get_irq(desc), chip ? (chip->name ? : "unknown") : "unknown"); return 0; } if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } /* Mask all flags except trigger mode */ flags &= IRQ_TYPE_SENSE_MASK; ret = chip->irq_set_type(&desc->irq_data, flags); switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); fallthrough; case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); irq_settings_set_trigger_mask(desc, flags); irqd_clear(&desc->irq_data, IRQD_LEVEL); irq_settings_clr_level(desc); if (flags & IRQ_TYPE_LEVEL_MASK) { irq_settings_set_level(desc); irqd_set(&desc->irq_data, IRQD_LEVEL); } ret = 0; break; default: pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) unmask_irq(desc); return ret; } #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->parent_irq = parent_irq; irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* * Default primary interrupt handler for threaded interrupts. Is * assigned as primary handler when request_threaded_irq is called * with handler == NULL. Useful for oneshot interrupts. */ static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) { return IRQ_WAKE_THREAD; } /* * Primary handler for nested threaded interrupts. Should never be * called. */ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) { WARN(1, "Primary handler called for nested irq %d\n", irq); return IRQ_NONE; } static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) { WARN(1, "Secondary action handler called for irq %d\n", irq); return IRQ_NONE; } #ifdef CONFIG_SMP /* * Check whether we need to change the affinity of the interrupt thread. 
*/ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; bool valid = false; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; __set_current_state(TASK_RUNNING); /* * In case we are out of memory we set IRQTF_AFFINITY again and * try again next time */ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { set_bit(IRQTF_AFFINITY, &action->thread_flags); return; } raw_spin_lock_irq(&desc->lock); /* * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ if (cpumask_available(desc->irq_common_data.affinity)) { const struct cpumask *m; m = irq_data_get_effective_affinity_mask(&desc->irq_data); cpumask_copy(mask, m); valid = true; } raw_spin_unlock_irq(&desc->lock); if (valid) set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif static int irq_wait_for_interrupt(struct irq_desc *desc, struct irqaction *action) { for (;;) { set_current_state(TASK_INTERRUPTIBLE); irq_thread_check_affinity(desc, action); if (kthread_should_stop()) { /* may need to run one last time */ if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } __set_current_state(TASK_RUNNING); return -1; } if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; } schedule(); } } /* * Oneshot interrupts keep the irq line masked until the threaded * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ static void irq_finalize_oneshot(struct irq_desc *desc, struct irqaction *action) { if (!(desc->istate & IRQS_ONESHOT) || action->handler == irq_forced_secondary_handler) return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* * Implausible though it may be we need to protect us against * the following scenario: * * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. * * This also serializes the state of shared oneshot handlers * versus "desc->threads_oneshot |= action->thread_mask;" in * irq_wake_thread(). See the comment there which explains the * serialization. */ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); goto again; } /* * Now check again, whether the thread should run. Otherwise * we would clear the threads_oneshot bit of this thread which * was just set. */ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) goto out_unlock; desc->threads_oneshot &= ~action->thread_mask; if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } /* * Interrupts explicitly requested as threaded interrupts want to be * preemptible - many of them need to sleep and wait for slow busses to * complete. 
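 *
 * Such handlers are typically installed with request_threaded_irq()
 * (sketch; the handler names are assumptions):
 *
 *	ret = request_threaded_irq(irq, my_quick_check, my_thread_fn,
 *				   IRQF_ONESHOT, "mydev", dev);
 *
 * where my_thread_fn() runs in this thread context and may sleep,
 * e.g. while talking to a device behind I2C or SPI.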
 */
static irqreturn_t irq_thread_fn(struct irq_desc *desc,
				 struct irqaction *action)
{
	irqreturn_t ret = action->thread_fn(action->irq, action->dev_id);

	if (ret == IRQ_HANDLED)
		atomic_inc(&desc->threads_handled);

	irq_finalize_oneshot(desc, action);
	return ret;
}

/*
 * Interrupts which are not explicitly requested as threaded
 * interrupts rely on the implicit bh/preempt disable of the hard irq
 * context. So we need to disable bh here to avoid deadlocks and other
 * side effects.
 */
static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc,
					struct irqaction *action)
{
	irqreturn_t ret;

	local_bh_disable();
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_disable();
	ret = irq_thread_fn(desc, action);
	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_enable();
	local_bh_enable();
	return ret;
}

void wake_threads_waitq(struct irq_desc *desc)
{
	if (atomic_dec_and_test(&desc->threads_active))
		wake_up(&desc->wait_for_threads);
}

static void irq_thread_dtor(struct callback_head *unused)
{
	struct task_struct *tsk = current;
	struct irq_desc *desc;
	struct irqaction *action;

	if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
		return;

	action = kthread_data(tsk);

	pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
	       tsk->comm, tsk->pid, action->irq);

	desc = irq_to_desc(action->irq);
	/*
	 * If IRQTF_RUNTHREAD is set, we need to decrement
	 * desc->threads_active and wake possible waiters.
	 */
	if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
		wake_threads_waitq(desc);

	/* Prevent a stale desc->threads_oneshot */
	irq_finalize_oneshot(desc, action);
}

static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
{
	struct irqaction *secondary = action->secondary;

	if (WARN_ON_ONCE(!secondary))
		return;

	raw_spin_lock_irq(&desc->lock);
	__irq_wake_thread(desc, secondary);
	raw_spin_unlock_irq(&desc->lock);
}

/*
 * Internal function to notify that an interrupt thread is ready.
 */
static void irq_thread_set_ready(struct irq_desc *desc,
				 struct irqaction *action)
{
	set_bit(IRQTF_READY, &action->thread_flags);
	wake_up(&desc->wait_for_threads);
}

/*
 * Internal function to wake up an interrupt thread and wait until it is
 * ready.
 */
static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
						  struct irqaction *action)
{
	if (!action || !action->thread)
		return;

	wake_up_process(action->thread);
	wait_event(desc->wait_for_threads,
		   test_bit(IRQTF_READY, &action->thread_flags));
}

/*
 * Interrupt handler thread
 */
static int irq_thread(void *data)
{
	struct callback_head on_exit_work;
	struct irqaction *action = data;
	struct irq_desc *desc = irq_to_desc(action->irq);
	irqreturn_t (*handler_fn)(struct irq_desc *desc,
				  struct irqaction *action);

	irq_thread_set_ready(desc, action);

	sched_set_fifo(current);

	if (force_irqthreads() &&
	    test_bit(IRQTF_FORCED_THREAD, &action->thread_flags))
		handler_fn = irq_forced_thread_fn;
	else
		handler_fn = irq_thread_fn;

	init_task_work(&on_exit_work, irq_thread_dtor);
	task_work_add(current, &on_exit_work, TWA_NONE);

	while (!irq_wait_for_interrupt(desc, action)) {
		irqreturn_t action_ret;

		action_ret = handler_fn(desc, action);
		if (action_ret == IRQ_WAKE_THREAD)
			irq_wake_secondary(desc, action);

		wake_threads_waitq(desc);
	}

	/*
	 * This is the regular exit path. __free_irq() is stopping the
	 * thread via kthread_stop() after calling
	 * synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the
	 * oneshot mask bit can be set.
*/ task_work_cancel_func(current, irq_thread_dtor); return 0; } /** * irq_wake_thread - wake the irq thread for the action identified by dev_id * @irq: Interrupt line * @dev_id: Device identity for which the thread should be woken * */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(irq_wake_thread); static int irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads()) return 0; if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; /* * No further action required for interrupts which are requested as * threaded interrupts already */ if (new->handler == irq_default_primary_handler) return 0; new->flags |= IRQF_ONESHOT; /* * Handle the case where we have a real primary handler and a * thread handler. We force thread them as well by creating a * secondary action. */ if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) return -ENOMEM; new->secondary->handler = irq_forced_secondary_handler; new->secondary->thread_fn = new->thread_fn; new->secondary->dev_id = new->dev_id; new->secondary->irq = new->irq; new->secondary->name = new->name; } /* Deal with the primary handler */ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); new->thread_fn = new->handler; new->handler = irq_default_primary_handler; return 0; } static int irq_request_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; return c->irq_request_resources ? c->irq_request_resources(d) : 0; } static void irq_release_resources(struct irq_desc *desc) { struct irq_data *d = &desc->irq_data; struct irq_chip *c = d->chip; if (c->irq_release_resources) c->irq_release_resources(d); } static bool irq_supports_nmi(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* Only IRQs directly managed by the root irqchip can be set as NMI */ if (d->parent_data) return false; #endif /* Don't support NMIs for chips behind a slow bus */ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) return false; return d->chip->flags & IRQCHIP_SUPPORTS_NMI; } static int irq_nmi_setup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; } static void irq_nmi_teardown(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *c = d->chip; if (c->irq_nmi_teardown) c->irq_nmi_teardown(d); } static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name); } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); } if (IS_ERR(t)) return PTR_ERR(t); /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code * references an already freed task_struct. */ new->thread = get_task_struct(t); /* * Tell the thread to set its affinity. 
This is * important for shared interrupt handlers as we do * not invoke setup_affinity() for the secondary * handlers as everything is already set up. Even for * interrupts marked with IRQF_NO_BALANCE this is * correct as we want the thread to move to the cpu(s) * on which the requesting code placed the interrupt. */ set_bit(IRQTF_AFFINITY, &new->thread_flags); return 0; } /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. * * Locking rules: * * desc->request_mutex Provides serialization against a concurrent free_irq() * chip_bus_lock Provides serialization for slow bus operations * desc->lock Provides serialization against hard interrupts * * chip_bus_lock and desc->lock are sufficient for all other management and * interrupt related functions. desc->request_mutex solely serializes * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; new->irq = irq; /* * If the trigger type is not specified by the caller, * then use the default for this interrupt. */ if (!(new->flags & IRQF_TRIGGER_MASK)) new->flags |= irqd_get_trigger_type(&desc->irq_data); /* * Check whether the interrupt nests into another interrupt * thread. */ nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) { ret = -EINVAL; goto out_mput; } /* * Replace the primary handler which was provided from * the driver for non nested interrupt handling by the * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; } else { if (irq_settings_can_thread(desc)) { ret = irq_setup_forced_threading(new); if (ret) goto out_mput; } } /* * Create a handler thread when a thread function is supplied * and the interrupt does not nest into another interrupt * thread. */ if (new->thread_fn && !nested) { ret = setup_irq_thread(new, irq, false); if (ret) goto out_mput; if (new->secondary) { ret = setup_irq_thread(new->secondary, irq, true); if (ret) goto out_thread; } } /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a * threaded irq without a primary hard irq context handler * requires the ONESHOT flag to be set. Some irq chips like * MSI based interrupts are per se one shot safe. Check the * chip flags, so we can avoid the unmask dance at the end of * the threaded handler for those. */ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; /* * Protects against a concurrent __free_irq() call which might wait * for synchronize_hardirq() to complete without holding the optional * chip bus lock and desc->lock. Also protects against handing out * a recycled oneshot thread_mask bit while it's still in use by * its previous owner. */ mutex_lock(&desc->request_mutex); /* * Acquire bus lock as the irq_request_resources() callback below * might rely on the serialization or the magic power management * functions which are abusing the irq_bus_lock() callback, */ chip_bus_lock(desc); /* First installed action requests resources. 
*/ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); goto out_bus_unlock; } } /* * The following block of code has to be executed atomically * protected against a concurrent interrupt and any of the other * management calls which are not serialized via * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { /* * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; if (irq_is_nmi(desc)) { pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", new->name, irq, desc->irq_data.chip->name); ret = -EINVAL; goto out_unlock; } /* * If nobody did set the configuration before, inherit * the one provided by the requester. */ if (irqd_trigger_type_was_set(&desc->irq_data)) { oldtype = irqd_get_trigger_type(&desc->irq_data); } else { oldtype = new->flags & IRQF_TRIGGER_MASK; irqd_set_trigger_type(&desc->irq_data, oldtype); } if (!((old->flags & new->flags) & IRQF_SHARED) || (oldtype != (new->flags & IRQF_TRIGGER_MASK))) goto mismatch; if ((old->flags & IRQF_ONESHOT) && (new->flags & IRQF_COND_ONESHOT)) new->flags |= IRQF_ONESHOT; else if ((old->flags ^ new->flags) & IRQF_ONESHOT) goto mismatch; /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; /* add new interrupt at end of irq queue */ do { /* * Or all existing action->thread_mask bits, * so we can find the next zero bit for this * new action. */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } /* * Setup the thread mask for this irqaction for ONESHOT. For * !ONESHOT irqs the thread mask is 0 so we can avoid a * conditional in irq_wake_thread(). */ if (new->flags & IRQF_ONESHOT) { /* * Unlikely to have 32 resp 64 irqs sharing one line, * but who knows. */ if (thread_mask == ~0UL) { ret = -EBUSY; goto out_unlock; } /* * The thread_mask for the action is or'ed to * desc->thread_active to indicate that the * IRQF_ONESHOT thread handler has been woken, but not * yet finished. The bit is cleared when a thread * completes. When all threads of a shared interrupt * line have completed desc->threads_active becomes * zero and the interrupt line is unmasked. See * handle.c:irq_wake_thread() for further information. * * If no thread is woken by primary (hard irq context) * interrupt handlers, then desc->threads_active is * also checked for zero to unmask the irq line in the * affected hard irq flow handlers * (handle_[fasteoi|level]_irq). * * The new action gets the first zero bit of * thread_mask assigned. See the loop above which or's * all existing action->thread_mask bits. */ new->thread_mask = 1UL << ffz(thread_mask); } else if (new->handler == irq_default_primary_handler && !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it * does not have the oneshot flag set. 
	 * In combination with level interrupts this is deadly, because
	 * the default primary handler just wakes the thread, then the
	 * irq line is re-enabled, but the device still has the level
	 * irq asserted. Rinse and repeat....
	 *
	 * While this works for edge type interrupts, we play it safe
	 * and reject unconditionally because we can't say for sure
	 * which type this interrupt really has. The type flags are
	 * unreliable as the underlying chip implementation can
	 * override them.
	 */
		pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
		       new->name, irq);
		ret = -EINVAL;
		goto out_unlock;
	}

	if (!shared) {
		/* Setup the type (level, edge, polarity) if configured: */
		if (new->flags & IRQF_TRIGGER_MASK) {
			ret = __irq_set_trigger(desc,
						new->flags & IRQF_TRIGGER_MASK);
			if (ret)
				goto out_unlock;
		}

		/*
		 * Activate the interrupt. That activation must happen
		 * independently of IRQ_NOAUTOEN. request_irq() can fail
		 * and the callers are supposed to handle
		 * that. enable_irq() of an interrupt requested with
		 * IRQ_NOAUTOEN is not supposed to fail. The activation
		 * keeps it in shutdown mode, it merely associates
		 * resources if necessary and if that's not possible it
		 * fails. Interrupts which are in managed shutdown mode
		 * will simply ignore that activation request.
		 */
		ret = irq_activate(desc);
		if (ret)
			goto out_unlock;

		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED |
				  IRQS_ONESHOT | IRQS_WAITING);
		irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);

		if (new->flags & IRQF_PERCPU) {
			irqd_set(&desc->irq_data, IRQD_PER_CPU);
			irq_settings_set_per_cpu(desc);
			if (new->flags & IRQF_NO_DEBUG)
				irq_settings_set_no_debug(desc);
		}

		if (noirqdebug)
			irq_settings_set_no_debug(desc);

		if (new->flags & IRQF_ONESHOT)
			desc->istate |= IRQS_ONESHOT;

		/* Exclude IRQ from balancing if requested */
		if (new->flags & IRQF_NOBALANCING) {
			irq_settings_set_no_balancing(desc);
			irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
		}

		if (!(new->flags & IRQF_NO_AUTOEN) &&
		    irq_settings_can_autoenable(desc)) {
			irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
		} else {
			/*
			 * Shared interrupts do not go well with disabling
			 * auto enable. The sharing interrupt might request
			 * it while it's still disabled and then wait for
			 * interrupts forever.
			 */
			WARN_ON_ONCE(new->flags & IRQF_SHARED);
			/* Undo nested disables: */
			desc->depth = 1;
		}

	} else if (new->flags & IRQF_TRIGGER_MASK) {
		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
		unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);

		if (nmsk != omsk)
			/* hope the handler works with current trigger mode */
			pr_warn("irq %d uses trigger mode %u; requested %u\n",
				irq, omsk, nmsk);
	}

	*old_ptr = new;

	irq_pm_install_action(desc, new);

	/* Reset broken irq detection when installing new handler */
	desc->irq_count = 0;
	desc->irqs_unhandled = 0;

	/*
	 * Check whether we disabled the irq via the spurious handler
	 * before. Re-enable it and give it another chance.
	 */
	if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
		desc->istate &= ~IRQS_SPURIOUS_DISABLED;
		__enable_irq(desc);
	}

	raw_spin_unlock_irqrestore(&desc->lock, flags);
	chip_bus_sync_unlock(desc);
	mutex_unlock(&desc->request_mutex);

	irq_setup_timings(desc, new);

	wake_up_and_wait_for_irq_thread_ready(desc, new);
	wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);

	register_irq_proc(irq, desc);
	new->dir = NULL;
	register_handler_proc(irq, new);
	return 0;

mismatch:
	if (!(new->flags & IRQF_PROBE_SHARED)) {
		pr_err("Flags mismatch irq %d. %08x (%s) vs.
%08x (%s)\n", irq, new->flags, new->name, old->flags, old->name); #ifdef CONFIG_DEBUG_SHIRQ dump_stack(); #endif } ret = -EBUSY; out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); if (!desc->action) irq_release_resources(desc); out_bus_unlock: chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: if (new->thread) { struct task_struct *t = new->thread; new->thread = NULL; kthread_stop_put(t); } if (new->secondary && new->secondary->thread) { struct task_struct *t = new->secondary->thread; new->secondary->thread = NULL; kthread_stop_put(t); } out_mput: module_put(desc->owner); return ret; } /* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right * one based on the dev_id: */ action_ptr = &desc->action; for (;;) { action = *action_ptr; if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); return NULL; } if (action->dev_id == dev_id) break; action_ptr = &action->next; } /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; irq_pm_remove_action(desc, action); /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ if (WARN_ON_ONCE(desc->affinity_hint)) desc->affinity_hint = NULL; #endif raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip * callbacks above are synced out to the irq chips which hang * behind a slow bus (I2C, SPI) before calling synchronize_hardirq(). * * Aside of that the bus_lock can also be taken from the threaded * handler in irq_finalize_oneshot() which results in a deadlock * because kthread_stop() would wait forever for the thread to * complete, which is blocked on the bus lock. * * The still held desc->request_mutex() protects against a * concurrent request_irq() of this irq so the release of resources * and timing data is properly serialized. */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); /* * Make sure it's not being used on another CPU and if the chip * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ * event to happen even now it's being freed, so let's make sure that * is so by doing an extra call to the handler .... * * ( We do this after actually deregistering it, to make sure that a * 'real' IRQ doesn't run in parallel with our fake. ) */ if (action->flags & IRQF_SHARED) { local_irq_save(flags); action->handler(irq, dev_id); local_irq_restore(flags); } #endif /* * The action has already been removed above, but the thread writes * its oneshot mask bit when it completes. 
Though request_mutex is * held across this which prevents __setup_irq() from handing out * the same bit to a newly requested action. */ if (action->thread) { kthread_stop_put(action->thread); if (action->secondary && action->secondary->thread) kthread_stop_put(action->secondary->thread); } /* Last action releases resources */ if (!desc->action) { /* * Reacquire bus lock as irq_release_resources() might * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); /* * There is no interrupt on the fly anymore. Deactivate it * completely. */ raw_spin_lock_irqsave(&desc->lock, flags); irq_domain_deactivate_irq(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); } mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); return action; } /** * free_irq - free an interrupt allocated with request_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove an interrupt handler. The handler is removed and if the * interrupt line is no longer in use by any driver it is disabled. * On a shared IRQ the caller must ensure the interrupt is disabled * on the card it drives before calling this function. The function * does not return until any executing interrupts for this IRQ * have completed. * * This function must not be called from interrupt context. * * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif action = __free_irq(desc, dev_id); if (!action) return NULL; devname = action->name; kfree(action); return devname; } EXPORT_SYMBOL(free_irq); /* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { const char *devname = NULL; desc->istate &= ~IRQS_NMI; if (!WARN_ON(desc->action == NULL)) { irq_pm_remove_action(desc, desc->action); devname = desc->action->name; unregister_handler_proc(irq, desc->action); kfree(desc->action); desc->action = NULL; } irq_settings_clr_disable_unlazy(desc); irq_shutdown_and_deactivate(desc); irq_release_resources(desc); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return devname; } const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; const void *devname; if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return NULL; /* NMI still enabled */ if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); raw_spin_lock_irqsave(&desc->lock, flags); irq_nmi_teardown(desc); devname = __cleanup_nmi(irq, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return devname; } /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts. * If handler is NULL and thread_fn != NULL * the default primary handler is installed. 
* @thread_fn: Function called from the irq handler thread * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt line and IRQ handling. From the point this * call is made your handler function may be invoked. Since * your handler function must clear any interrupt the board * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. * * If you want to set up a threaded irq handler for your device * then you need to supply @handler and @thread_fn. @handler is * still called in hard interrupt context and has to check * whether the interrupt originates from the device. If yes it * needs to disable the interrupt on the device and return * IRQ_WAKE_THREAD which will wake up the handler thread and run * @thread_fn. This split handler design is necessary to support * shared interrupts. * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. * * If your interrupt is shared you must pass a non NULL dev_id * as this is required when freeing the interrupt. * * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level * IRQF_ONESHOT Run thread_fn with interrupt line masked */ int request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). * * Also shared interrupts do not go well with disabling auto enable. * The sharing interrupt might request it while it's still disabled * and then wait for interrupts forever. * * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and * it cannot be set along with IRQF_NO_SUSPEND. */ if (((irqflags & IRQF_SHARED) && !dev_id) || ((irqflags & IRQF_SHARED) && (irqflags & IRQF_NO_AUTOEN)) || (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { if (!thread_fn) return -EINVAL; handler = irq_default_primary_handler; } action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action->secondary); kfree(action); } #ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... * We disable the irq to make sure that a 'real' IRQ doesn't * run in parallel with our fake. 
*/ unsigned long flags; disable_irq(irq); local_irq_save(flags); handler(irq, dev_id); local_irq_restore(flags); enable_irq(irq); } #endif return retval; } EXPORT_SYMBOL(request_threaded_irq); /** * request_any_context_irq - allocate an interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @flags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt line and IRQ handling. It selects either a * hardirq or threaded handling method depending on the * context. * * On failure, it returns a negative value. On success, * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { struct irq_desc *desc; int ret; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; desc = irq_to_desc(irq); if (!desc) return -EINVAL; if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; } ret = request_irq(irq, handler, flags, name, dev_id); return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); /** * request_nmi - allocate an interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Threaded handler for threaded interrupts. * @irqflags: Interrupt type flags * @name: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt line and IRQ handling. It sets up the IRQ line * to be handled as an NMI. * * An interrupt line delivering NMIs cannot be shared and IRQ handling * cannot be threaded. * * Interrupt lines requested for NMI delivering must produce per cpu * interrupts and have auto enabling setting disabled. * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. * * If the interrupt line cannot be used to deliver NMIs, function * will fail and return a negative value. 
*/ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; unsigned long flags; int retval; if (irq == IRQ_NOTCONNECTED) return -ENOTCONN; /* NMI cannot be shared, used for Polling */ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) return -EINVAL; if (!(irqflags & IRQF_PERCPU)) return -EINVAL; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN)) || !irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(desc)) || !irq_supports_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; raw_spin_lock_irqsave(&desc->lock, flags); /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); if (retval) { __cleanup_nmi(irq, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return -EINVAL; } raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } void enable_percpu_irq(unsigned int irq, unsigned int type) { unsigned int cpu = smp_processor_id(); unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return; /* * If the trigger type is not specified by the caller, then * use the default for this interrupt. */ type &= IRQ_TYPE_SENSE_MASK; if (type == IRQ_TYPE_NONE) type = irqd_get_trigger_type(&desc->irq_data); if (type != IRQ_TYPE_NONE) { int ret; ret = __irq_set_trigger(desc, type); if (ret) { WARN(1, "failed to set type for IRQ%d\n", irq); goto out; } } irq_percpu_enable(desc, cpu); out: irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(enable_percpu_irq); void enable_percpu_nmi(unsigned int irq, unsigned int type) { enable_percpu_irq(irq, type); } /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for * * Must be called from a non migratable context. Returns the enable * state of a per cpu interrupt on the current cpu. */ bool irq_percpu_is_enabled(unsigned int irq) { unsigned int cpu = smp_processor_id(); struct irq_desc *desc; unsigned long flags; bool is_enabled; desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return false; is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); irq_put_desc_unlock(desc, flags); return is_enabled; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { unsigned int cpu = smp_processor_id(); unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return; irq_percpu_disable(desc, cpu); irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(disable_percpu_irq); void disable_percpu_nmi(unsigned int irq) { disable_percpu_irq(irq); } /* * Internal function to unregister a percpu irqaction. 
*/ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (!action || action->percpu_dev_id != dev_id) { WARN(1, "Trying to free already-free IRQ %d\n", irq); goto bad; } if (!cpumask_empty(desc->percpu_enabled)) { WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", irq, cpumask_first(desc->percpu_enabled)); goto bad; } /* Found it - now remove it from the list of entries: */ desc->action = NULL; desc->istate &= ~IRQS_NMI; raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; bad: raw_spin_unlock_irqrestore(&desc->lock, flags); return NULL; } /** * remove_percpu_irq - free a per-cpu interrupt * @irq: Interrupt line to free * @act: irqaction for the interrupt * * Used to remove interrupts statically setup by the early boot process. */ void remove_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); if (desc && irq_settings_is_per_cpu_devid(desc)) __free_percpu_irq(irq, act->percpu_dev_id); } /** * free_percpu_irq - free an interrupt allocated with request_percpu_irq * @irq: Interrupt line to free * @dev_id: Device identity to free * * Remove a percpu interrupt handler. The handler is removed, but * the interrupt line is not disabled. This must be done on each * CPU before calling this function. The function does not return * until any executing interrupts for this IRQ have completed. * * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; chip_bus_lock(desc); kfree(__free_percpu_irq(irq, dev_id)); chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(free_percpu_irq); void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !irq_settings_is_per_cpu_devid(desc)) return; if (WARN_ON(!irq_is_nmi(desc))) return; kfree(__free_percpu_irq(irq, dev_id)); } /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ int setup_percpu_irq(unsigned int irq, struct irqaction *act) { struct irq_desc *desc = irq_to_desc(irq); int retval; if (!desc || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) return retval; retval = __setup_irq(irq, desc, act); if (retval) irq_chip_pm_put(&desc->irq_data); return retval; } /** * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources and enables the * interrupt on the local CPU. If the interrupt is supposed to be * enabled on other CPUs, it has to be done on each CPU using * enable_percpu_irq(). * * Dev_id must be globally unique. 
It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; int retval; if (!dev_id) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; if (flags && flags != IRQF_TIMER) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) { kfree(action); return retval; } retval = __setup_irq(irq, desc, action); if (retval) { irq_chip_pm_put(&desc->irq_data); kfree(action); } return retval; } EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * @name: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs * have to be setup on each CPU by calling prepare_percpu_nmi() before * being enabled on the same CPU by using enable_percpu_nmi(). * * Dev_id must be globally unique. It is a per-cpu variable, and * the handler gets called with the interrupted CPU's instance of * that variable. * * Interrupt lines requested for NMI delivering should have auto enabling * setting disabled. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. */ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; unsigned long flags; int retval; if (!handler) return -EINVAL; desc = irq_to_desc(irq); if (!desc || !irq_settings_can_request(desc) || !irq_settings_is_per_cpu_devid(desc) || irq_settings_can_autoenable(desc) || !irq_supports_nmi(desc)) return -EINVAL; /* The line cannot already be NMI */ if (irq_is_nmi(desc)) return -EINVAL; action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD | IRQF_NOBALANCING; action->name = name; action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); if (retval < 0) goto err_out; retval = __setup_irq(irq, desc, action); if (retval) goto err_irq_setup; raw_spin_lock_irqsave(&desc->lock, flags); desc->istate |= IRQS_NMI; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: kfree(action); return retval; } /** * prepare_percpu_nmi - performs CPU local setup for NMI delivery * @irq: Interrupt line to prepare for NMI delivery * * This call prepares an interrupt line to deliver NMI on the current CPU, * before that interrupt line gets enabled with enable_percpu_nmi(). * * As a CPU local operation, this should be called from non-preemptible * context. * * If the interrupt line cannot be used to deliver NMIs, function * will fail returning a negative value. 
*/ int prepare_percpu_nmi(unsigned int irq) { unsigned long flags; struct irq_desc *desc; int ret = 0; WARN_ON(preemptible()); desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return -EINVAL; if (WARN(!irq_is_nmi(desc), KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) { ret = -EINVAL; goto out; } ret = irq_nmi_setup(desc); if (ret) { pr_err("Failed to setup NMI delivery: irq %u\n", irq); goto out; } out: irq_put_desc_unlock(desc, flags); return ret; } /** * teardown_percpu_nmi - undoes NMI setup of IRQ line * @irq: Interrupt line from which CPU local NMI configuration should be * removed * * This call undoes the setup done by prepare_percpu_nmi(). * * IRQ line should not be enabled for the current CPU. * * As a CPU local operation, this should be called from non-preemptible * context. */ void teardown_percpu_nmi(unsigned int irq) { unsigned long flags; struct irq_desc *desc; WARN_ON(preemptible()); desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); if (!desc) return; if (WARN_ON(!irq_is_nmi(desc))) goto out; irq_nmi_teardown(desc); out: irq_put_desc_unlock(desc, flags); } static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; do { chip = irq_data_get_irq_chip(data); if (WARN_ON_ONCE(!chip)) return -ENODEV; if (chip->irq_get_irqchip_state) break; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY data = data->parent_data; #else data = NULL; #endif } while (data); if (data) err = chip->irq_get_irqchip_state(data, which, state); return err; } /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about * @state: a pointer to a boolean where the state is to be stored * * This call snapshots the internal irqchip state of an * interrupt, returning into @state the bit corresponding to * stage @which * * This function should be called with preemption disabled if the * interrupt controller has per-cpu registers. */ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { struct irq_desc *desc; struct irq_data *data; unsigned long flags; int err = -EINVAL; desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return err; data = irq_desc_get_irq_data(desc); err = __irq_get_irqchip_state(data, which, state); irq_put_desc_busunlock(desc, flags); return err; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** * irq_set_irqchip_state - set the state of a forwarded interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: State to be restored (one of IRQCHIP_STATE_*) * @val: Value corresponding to @which * * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. 
 */
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
			  bool val)
{
	struct irq_desc *desc;
	struct irq_data *data;
	struct irq_chip *chip;
	unsigned long flags;
	int err = -EINVAL;

	desc = irq_get_desc_buslock(irq, &flags, 0);
	if (!desc)
		return err;

	data = irq_desc_get_irq_data(desc);

	do {
		chip = irq_data_get_irq_chip(data);
		if (WARN_ON_ONCE(!chip)) {
			err = -ENODEV;
			goto out_unlock;
		}
		if (chip->irq_set_irqchip_state)
			break;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
		data = data->parent_data;
#else
		data = NULL;
#endif
	} while (data);

	if (data)
		err = chip->irq_set_irqchip_state(data, which, val);

out_unlock:
	irq_put_desc_busunlock(desc, flags);
	return err;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);

/**
 * irq_has_action - Check whether an interrupt is requested
 * @irq:	The linux irq number
 *
 * Returns: A snapshot of the current state
 */
bool irq_has_action(unsigned int irq)
{
	bool res;

	rcu_read_lock();
	res = irq_desc_has_action(irq_to_desc(irq));
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(irq_has_action);

/**
 * irq_check_status_bit - Check whether bits in the irq descriptor status are set
 * @irq:	The linux irq number
 * @bitmask:	The bitmask to evaluate
 *
 * Returns: True if one of the bits in @bitmask is set
 */
bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
{
	struct irq_desc *desc;
	bool res = false;

	rcu_read_lock();
	desc = irq_to_desc(irq);
	if (desc)
		res = !!(desc->status_use_accessors & bitmask);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL_GPL(irq_check_status_bit);
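/*
 * Editor's illustration (not part of manage.c): the request_threaded_irq()
 * contract implemented above, seen from the driver side. A hedged sketch
 * under assumed names - struct my_dev, my_irq_pending(), my_mask_device_irq(),
 * my_unmask_device_irq() and my_process_events() are hypothetical stand-ins
 * for real hardware specifics, not kernel APIs. Assumes <linux/interrupt.h>.
 */
static irqreturn_t my_primary_handler(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;

	/* On a shared line, first check the interrupt is really ours. */
	if (!my_irq_pending(dev))
		return IRQ_NONE;

	/* Quiesce the device, then defer the slow work to the irq thread. */
	my_mask_device_irq(dev);
	return IRQ_WAKE_THREAD;
}

static irqreturn_t my_thread_fn(int irq, void *dev_id)
{
	struct my_dev *dev = dev_id;

	my_process_events(dev);		/* may sleep, e.g. slow bus I/O */
	my_unmask_device_irq(dev);
	return IRQ_HANDLED;
}

static int my_probe(struct my_dev *dev)
{
	/*
	 * dev (the dev_id cookie) must be unique per action; it is also
	 * what free_irq(dev->irq, dev) later uses to find this action on
	 * a shared line. Passing handler == NULL instead would install
	 * irq_default_primary_handler() and force IRQF_ONESHOT, exactly
	 * as the checks in __setup_irq() above require.
	 */
	return request_threaded_irq(dev->irq, my_primary_handler,
				    my_thread_fn, IRQF_SHARED,
				    "my-dev", dev);
}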
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file. Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting. All on-duty workers are
 * either serving the manager role, on idle list or on busy hash. For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* K: work being processed and its */
	work_func_t		current_func;	/* K: function */
	struct pool_workqueue	*current_pwq;	/* K: pwq */
	u64			current_at;	/* K: runtime at start or last wakeup */
	unsigned int		current_color;	/* K: color */

	int			sleeping;	/* S: is worker sleeping? */

	/* used by the scheduler to determine a worker's last known identity */
	work_func_t		last_func;	/* K: last work's fn */

	struct list_head	scheduled;	/* L: scheduled works */

	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* A: the associated pool */
						/* L: for rescuers */
	struct list_head	node;		/* A: anchored at pool->workers */
						/* A: runs through worker->node */

	unsigned long		last_active;	/* K: last active timestamp */
	unsigned int		flags;		/* L: flags */
	int			id;		/* I: worker id */

	/*
	 * Opaque string set with work_set_desc(). Printed out with task
	 * dump for debugging - WARN, BUG, panic or sysrq.
	 */
	char			desc[WORKER_DESC_LEN];

	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 */
static inline struct worker *current_wq_worker(void)
{
	if (in_task() && (current->flags & PF_WQ_WORKER))
		return kthread_data(current);
	return NULL;
}

/*
 * Scheduler hooks for concurrency managed workqueue. Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
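/*
 * Editor's illustration (not part of workqueue_internal.h): how the
 * current_wq_worker() helper above is meant to be consumed. A hedged
 * sketch - task_is_running_work() is a made-up helper, but the pattern
 * (PF_WQ_WORKER-guarded kthread_data() access via current_wq_worker(),
 * then reading per-worker fields) is the one the scheduler hooks
 * declared above rely on in workqueue.c.
 */
static bool task_is_running_work(work_func_t fn)
{
	struct worker *worker = current_wq_worker();

	/* NULL unless %current is a workqueue worker in task context */
	return worker && worker->current_func == fn;
}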
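/*
 * Editor's illustration (not part of either file): the per-CPU interrupt
 * flow from the interrupt-management code earlier in this listing. The
 * line is requested once with a __percpu cookie and then enabled (and
 * later disabled) locally on each CPU. Hedged sketch: my_percpu_state,
 * my_timer_irq and my_handle_tick() are hypothetical; request_percpu_irq()
 * and enable_percpu_irq() are the real APIs shown above.
 */
static DEFINE_PER_CPU(struct my_percpu_state, my_percpu_state);

static irqreturn_t my_percpu_handler(int irq, void *dev_id)
{
	/* The handler receives the interrupted CPU's instance. */
	struct my_percpu_state *st = dev_id;

	return my_handle_tick(st) ? IRQ_HANDLED : IRQ_NONE;
}

static int my_percpu_init(void)
{
	int ret;

	ret = request_percpu_irq(my_timer_irq, my_percpu_handler,
				 "my-timer", &my_percpu_state);
	if (ret)
		return ret;

	/* enable_percpu_irq() is CPU-local and must run on each CPU. */
	enable_percpu_irq(my_timer_irq, IRQ_TYPE_NONE);
	return 0;
}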
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/gre.h>
#include <net/pptp.h>
#include <net/tipc.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
#include <linux/bpf-netns.h>

static void dissector_set_key(struct flow_dissector *flow_dissector,
			      enum flow_dissector_key_id key_id)
{
	flow_dissector->used_keys |= (1ULL << key_id);
}
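/*
 * Editor's note: the read side of the used_keys bitmap set by
 * dissector_set_key() above is dissector_uses_key(), declared in
 * <net/flow_dissector.h>. It is reproduced here (to the best of the
 * editor's knowledge of that header) only to make the one-bit-per-key
 * invariant explicit: the fast path skips every key the caller did not
 * register.
 */
static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
				      enum flow_dissector_key_id key_id)
{
	return flow_dissector->used_keys & (1ULL << key_id);
}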
} void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) { unsigned int i; memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); BUG_ON(dissector_uses_key(flow_dissector, key->key_id)); dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides * any per-net-namespace one. When attaching to root, * make sure we don't have any BPF program attached * to the non-root namespaces. */ struct net *ns; for_each_net(ns) { if (ns == &init_net) continue; if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } return 0; } #endif /* CONFIG_BPF_SYSCALL */ /** * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset * @data: raw buffer pointer to the packet, if NULL use skb->data * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); if (!data) { data = skb->data; hlen = skb_headlen(skb); } if (poff >= 0) { __be32 *ports, _ports; ports = __skb_header_pointer(skb, thoff + poff, sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: return true; } return false; } /** * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); if (!ih) return; key_icmp->type = ih->type; key_icmp->code = ih->code; /* As we use 0 to signal that the Id field is not present, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) key_icmp->id = ih->un.echo.id ? 
ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } EXPORT_SYMBOL(skb_flow_get_icmp_tci); /* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) return; key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } static void __skb_flow_dissect_ah(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_ah; struct ip_auth_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_ah = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_ah->spi = hdr->spi; } static void __skb_flow_dissect_esp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_esp; struct ip_esp_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_esp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_esp->spi = hdr->spi; } static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_l2tpv3 *key_l2tpv3; struct { __be32 session_id; } *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_l2tpv3 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3, target_container); key_l2tpv3->session_id = hdr->session_id; } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_meta *meta; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) return; meta = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (tc_skb_ext_tc_enabled()) { struct tc_skb_ext *ext; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) meta->l2_miss = ext->l2_miss; } #endif } EXPORT_SYMBOL(skb_flow_dissect_meta); static void skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, u32 ctrl_flags, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_control *ctrl; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) return; ctrl = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; ctrl->flags = ctrl_flags; } void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; enum ip_conntrack_info ctinfo; struct 
nf_conn_labels *cl; struct nf_conn *ct; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) return; ct = nf_ct_get(skb, &ctinfo); if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; key->ct_zone = zone; return; } if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); if (cl) memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); #endif /* CONFIG_NF_CONNTRACK */ } EXPORT_SYMBOL(skb_flow_dissect_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); if (!info) return; key = &info->key; if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; ipv4 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, target_container); ipv4->src = key->u.ipv4.src; ipv4->dst = key->u.ipv4.dst; } break; case AF_INET6: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; ipv6 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, target_container); ipv6->src = key->u.ipv6.src; ipv6->dst = key->u.ipv6.dst; } break; default: skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, target_container); break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_dissector_key_keyid *keyid; keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID, target_container); keyid->keyid = tunnel_id_to_key32(key->tun_id); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *tp; tp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, target_container); tp->src = key->tp_src; tp->dst = key->tp_dst; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { struct 
flow_dissector_key_ip *ip; ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP, target_container); ip->tos = key->tos; ip->ttl = key->ttl; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); if (!info->options_len) return; enc_opt->len = info->options_len; ip_tunnel_info_opts_get(enc_opt->data, info); ip_tunnel_set_options_present(flags); ip_tunnel_flags_and(flags, info->key.tun_flags, flags); val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, IP_TUNNEL_GENEVE_OPT_BIT); enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_hash *key; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_HASH, target_container); key->hash = skb_get_hash_raw(skb); } EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; if (lse_index >= FLOW_DIS_MPLS_MAX) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); lse = &key_mpls->ls[lse_index]; lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; lse->mpls_bos = bos; lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; lse->mpls_label = label; dissector_set_mpls_lse(key_mpls, lse_index); } if (*entropy_label && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { struct flow_dissector_key_keyid *key_keyid; key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); key_keyid->keyid = cpu_to_be32(label); } *entropy_label = label == MPLS_LABEL_ENTROPY; return bos ? 
FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { unsigned char ar_sha[ETH_ALEN]; unsigned char ar_sip[4]; unsigned char ar_tha[ETH_ALEN]; unsigned char ar_tip[4]; } *arp_eth, _arp_eth; const struct arphdr *arp; struct arphdr _arp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) return FLOW_DISSECT_RET_OUT_GOOD; arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, hlen, &_arp); if (!arp) return FLOW_DISSECT_RET_OUT_BAD; if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4 || (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST))) return FLOW_DISSECT_RET_OUT_BAD; arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), sizeof(_arp_eth), data, hlen, &_arp_eth); if (!arp_eth) return FLOW_DISSECT_RET_OUT_BAD; key_arp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ARP, target_container); memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); /* Only store the lower byte of the opcode; * this covers ARPOP_REPLY and ARPOP_REQUEST. */ key_arp->op = ntohs(arp->ar_op) & 0xff; ether_addr_copy(key_arp->sha, arp_eth->ar_sha); ether_addr_copy(key_arp->tha, arp_eth->ar_tha); return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_cfm(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_cfm *key, *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, target_container); key->mdl_ver = hdr->mdl_ver; key->opcode = hdr->opcode; return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { struct flow_dissector_key_keyid *key_keyid; struct gre_base_hdr *hdr, _hdr; int offset = 0; u16 gre_ver; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, *p_hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; /* Only look inside GRE without routing */ if (hdr->flags & GRE_ROUTING) return FLOW_DISSECT_RET_OUT_GOOD; /* Only look inside GRE for version 0 and 1 */ gre_ver = ntohs(hdr->flags & GRE_VERSION); if (gre_ver > 1) return FLOW_DISSECT_RET_OUT_GOOD; *p_proto = hdr->protocol; if (gre_ver) { /* Version1 must be PPTP, and check the flags */ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) return FLOW_DISSECT_RET_OUT_GOOD; } offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; keyid = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_keyid), data, *p_hlen, &_keyid); if (!keyid) return FLOW_DISSECT_RET_OUT_BAD; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { 
key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); if (gre_ver == 0) key_keyid->keyid = *keyid; else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { const struct ethhdr *eth; struct ethhdr _eth; eth = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_eth), data, *p_hlen, &_eth); if (!eth) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = eth->h_proto; offset += sizeof(*eth); /* Cap headers that we access via pointers at the * end of the Ethernet header as our maximum alignment * at that point is only 2 bytes. */ if (NET_IP_ALIGN) *p_hlen = *p_nhoff + offset; } } else { /* version 1, must be PPTP */ u8 _ppp_hdr[PPP_HDRLEN]; u8 *ppp_hdr; if (hdr->flags & GRE_ACK) offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), data, *p_hlen, _ppp_hdr); if (!ppp_hdr) return FLOW_DISSECT_RET_OUT_BAD; switch (PPP_PROTOCOL(ppp_hdr)) { case PPP_IP: *p_proto = htons(ETH_P_IP); break; case PPP_IPV6: *p_proto = htons(ETH_P_IPV6); break; default: /* Could probably catch some more like MPLS */ break; } offset += PPP_HDRLEN; } *p_nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } /** * __skb_flow_dissect_batadv() - dissect batman-adv header * @skb: sk_buff to with the batman-adv header * @key_control: flow dissectors control key * @data: raw buffer pointer to the packet, if NULL use skb->data * @p_proto: pointer used to update the protocol to process next * @p_nhoff: pointer used to update inner network header offset * @hlen: packet header length * @flags: any combination of FLOW_DISSECTOR_F_* * * ETH_P_BATMAN packets are tried to be dissected. Only * &struct batadv_unicast packets are actually processed because they contain an * inner ethernet header and are usually followed by actual network header. This * allows the flow dissector to continue processing the packet. 
* * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, * otherwise FLOW_DISSECT_RET_OUT_BAD */ static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, const void *data, __be16 *p_proto, int *p_nhoff, int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; struct ethhdr eth; } *hdr, _hdr; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = hdr->eth.h_proto; *p_nhoff += sizeof(*hdr); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) return; th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); if (!th) return; if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) return; key_tcp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TCP, target_container); key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); if (!key_ports && !key_ports_range) return; ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); if (key_ports) key_ports->ports = ports; if (key_ports_range) key_ports_range->tp.ports = ports; } static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = iph->tos; key_ip->ttl = iph->ttl; } static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = ipv6_get_dsfield(iph); key_ip->ttl = iph->hop_limit; } /* Maximum number of protocol headers that can be parsed in * __skb_flow_dissect */ #define MAX_FLOW_DISSECT_HDRS 15 static bool skb_flow_dissect_allowed(int *num_hdrs) { ++*num_hdrs; return (*num_hdrs <= 
MAX_FLOW_DISSECT_HDRS); } static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); key_control->thoff = flow_keys->thoff; if (flow_keys->is_frag) key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (flow_keys->is_first_frag) key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flow_keys->is_encap) key_control->flags |= FLOW_DIS_ENCAPSULATION; key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); key_basic->n_proto = flow_keys->n_proto; key_basic->ip_proto = flow_keys->ip_proto; if (flow_keys->addr_proto == ETH_P_IP && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); key_addrs->v4addrs.src = flow_keys->ipv4_src; key_addrs->v4addrs.dst = flow_keys->ipv4_dst; key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else if (flow_keys->addr_proto == ETH_P_IPV6 && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) { key_ports_range = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); key_ports_range->tp.src = flow_keys->sport; key_ports_range->tp.dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_keys->flow_label); } } u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); flow_keys->n_proto = proto; flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { return hdr->ver == 1 && hdr->type == 1 
&& hdr->code == 0; } /** * __skb_flow_dissect - extract the flow_keys struct and return it * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the * rest parameters. * * Caller must take care of zeroing target container memory. */ bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; if (!data) { data = skb->data; proto = skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { struct metadata_dst *md_dst = skb_metadata_dst(skb); const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ if (ops->needed_headroom && (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else dsa_tag_generic_flow_dissect(skb, &proto, &offset); hlen -= offset; nhoff += offset; } } #endif } /* It is ensured by skb_flow_dissector_init() that control key will * be always present. */ key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. 
*/ key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); rcu_read_lock(); if (skb) { if (!net) { if (skb->dev) net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, .data = data, .data_end = data + hlen, }; __be16 n_proto = proto; struct bpf_prog *prog; u32 result; if (skb) { ctx.skb = skb; /* we can't use 'proto' in the skb case * because it might be set to skb->vlan_proto * which has been pulled from the data */ n_proto = skb->protocol; } prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); if (result != BPF_FLOW_DISSECTOR_CONTINUE) { __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } } } rcu_read_unlock(); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { struct flow_dissector_key_num_of_vlans *key_num_of_vlans; key_num_of_vlans = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_num_of_vlans->num_of_vlans = 0; } proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += iph->ihl * 4; ip_proto = iph->protocol; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); memcpy(&key_addrs->v4addrs.src, &iph->saddr, sizeof(key_addrs->v4addrs.src)); memcpy(&key_addrs->v4addrs.dst, &iph->daddr, sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } } break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &iph->saddr, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &iph->daddr, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if 
((dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL) || (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && ip6_flowlabel(iph)) { __be32 flow_label = ip6_flowlabel(iph); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); break; } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; __be16 saved_vlan_tpid = proto; if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_nvs->num_of_vlans++; } if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; } else { fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, dissector_vlan, target_container); if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; key_vlan->vlan_priority = (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; u16 ppp_proto; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* least significant bit of the most significant octet * indicates if protocol field was compressed */ ppp_proto = ntohs(hdr->proto); if (ppp_proto & 0x0100) { ppp_proto = ppp_proto >> 8; nhoff += PPPOE_SES_HLEN - 1; } else { nhoff += PPPOE_SES_HLEN; } if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_UC) { proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_MC) { proto = htons(ETH_P_MPLS_MC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto_is_valid(ppp_proto)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE)) { struct flow_dissector_key_pppoe *key_pppoe; key_pppoe = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE, target_container); key_pppoe->session_id = hdr->hdr.sid; key_pppoe->ppp_proto = htons(ppp_proto); key_pppoe->type = 
htons(ETH_P_PPP_SES); } break; } case htons(ETH_P_TIPC): { struct tipc_basic_hdr *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC, target_container); key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, nhoff, hlen, mpls_lse, &mpls_el); nhoff += sizeof(struct mpls_label); mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += FCOE_HEADER_LEN; fdret = FLOW_DISSECT_RET_OUT_GOOD; break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case htons(ETH_P_BATMAN): fdret = __skb_flow_dissect_batadv(skb, key_control, data, &proto, &nhoff, hlen, flags); break; case htons(ETH_P_1588): { struct ptp_header *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_PRP): case htons(ETH_P_HSR): { struct hsr_tag *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = hdr->encap_proto; nhoff += HSR_HLEN; fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_CFM): fdret = __skb_flow_dissect_cfm(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* Process result of proto processing */ switch (fdret) { case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (ip_proto) { case IPPROTO_GRE: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); break; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { u8 _opthdr[2], *opthdr; if (proto != htons(ETH_P_IPV6)) break; opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; if (proto != htons(ETH_P_IPV6)) break; fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); if (!fh) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = 
FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_IPV6: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_L2TP: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ESP: __skb_flow_dissect_esp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_AH: __skb_flow_dissect_ah(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: break; } if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) __skb_flow_dissect_ports(skb, flow_dissector, target_container, data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto ip_proto_again; break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } out_good: ret = true; out: key_control->thoff = min_t(u16, nhoff, skb ? 
skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; return ret; out_bad: ret = false; goto out; } EXPORT_SYMBOL(__skb_flow_dissect); static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static const void *flow_keys_hash_start(const struct flow_keys *flow) { BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: diff -= sizeof(flow->addrs.v4addrs); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; case FLOW_DISSECTOR_KEY_TIPC: diff -= sizeof(flow->addrs.tipckey); break; } return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.src; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); case FLOW_DISSECTOR_KEY_TIPC: return flow->addrs.tipckey.key; default: return 0; } } EXPORT_SYMBOL(flow_get_u32_src); __be32 flow_get_u32_dst(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.dst; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.dst); default: return 0; } } EXPORT_SYMBOL(flow_get_u32_dst); /* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: if ((__force u32)keys->addrs.v4addrs.dst < (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); } if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); hash = siphash(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; return hash; } u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval) { return __flow_hash_from_keys(keys, keyval); } EXPORT_SYMBOL(flow_hash_from_keys_seed); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; u8 padding; __be32 ports; __be32 src; __be32 dst; }; void 
make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow) { struct _flow_keys_digest_data *data = (struct _flow_keys_digest_data *)digest; BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); memset(digest, 0, sizeof(*digest)); data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; data->src = flow->addrs.v4addrs.src; data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** * __skb_get_hash_net: calculate a flow hash * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); __flow_hash_secret_init(); hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) { struct flow_keys keys; return ___skb_get_hash(skb, &keys, perturb); } EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; /* skip L4 headers for fragments after the first */ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) return poff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; u8 _doff; doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), data, hlen, &_doff); if (!doff) return poff; poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: case IPPROTO_UDPLITE: poff += sizeof(struct udphdr); break; /* For the rest, we do not really care about header * extensions at this point for now. */ case IPPROTO_ICMP: poff += sizeof(struct icmphdr); break; case IPPROTO_ICMPV6: poff += sizeof(struct icmp6hdr); break; case IPPROTO_IGMP: poff += sizeof(struct igmphdr); break; case IPPROTO_DCCP: poff += sizeof(struct dccp_hdr); break; case IPPROTO_SCTP: poff += sizeof(struct sctphdr); break; } return poff; } /** * skb_get_poff - get the offset to the payload * @skb: sk_buff to get the payload offset from * * The function will get the offset to the payload as far as it could * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. 
*/ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, sizeof(keys->addrs.v6addrs.src)); memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, sizeof(keys->addrs.v6addrs.dst)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); } EXPORT_SYMBOL(__get_hash_from_flowi6); static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, }; static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, }; struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); struct flow_dissector flow_keys_basic_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { skb_flow_dissector_init(&flow_keys_dissector, flow_keys_dissector_keys, ARRAY_SIZE(flow_keys_dissector_keys)); skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } core_initcall(init_default_flow_dissectors);
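To make the registration pattern above concrete, here is a minimal sketch that builds a private dissector the same way init_default_flow_dissectors() does and feeds its output to flow_hash_from_keys(). It is illustrative only, uses just the symbols visible in this file, and the example_* names are hypothetical.

/*
 * Illustrative sketch, not part of the kernel tree: a 5-tuple-only
 * dissector. The control and basic keys are mandatory (enforced by
 * skb_flow_dissector_init() above); the target container is a plain
 * struct flow_keys, which the caller must zero before dissection.
 */
static const struct flow_dissector_key example_dissector_keys[] = {
	{ .key_id = FLOW_DISSECTOR_KEY_CONTROL,
	  .offset = offsetof(struct flow_keys, control) },
	{ .key_id = FLOW_DISSECTOR_KEY_BASIC,
	  .offset = offsetof(struct flow_keys, basic) },
	{ .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
	  .offset = offsetof(struct flow_keys, addrs.v4addrs) },
	{ .key_id = FLOW_DISSECTOR_KEY_PORTS,
	  .offset = offsetof(struct flow_keys, ports) },
};

static struct flow_dissector example_dissector __read_mostly;

static int __init example_dissector_init(void)
{
	skb_flow_dissector_init(&example_dissector, example_dissector_keys,
				ARRAY_SIZE(example_dissector_keys));
	return 0;
}

static u32 example_skb_flow_hash(const struct sk_buff *skb)
{
	struct flow_keys keys;

	/* __skb_flow_dissect() expects a zeroed target container. */
	memset(&keys, 0, sizeof(keys));
	if (!__skb_flow_dissect(NULL, skb, &example_dissector, &keys,
				NULL, 0, 0, 0, 0))
		return 0;

	return flow_hash_from_keys(&keys);
}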
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Security-Enhanced Linux (SELinux) security module
 *
 * This file contains the SELinux security data structures for kernel objects.
 *
 * Author(s): Stephen Smalley, <stephen.smalley.work@gmail.com>
 *            Chris Vance, <cvance@nai.com>
 *            Wayne Salamon, <wsalamon@nai.com>
 *            James Morris <jmorris@redhat.com>
 *
 * Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 * Copyright (C) 2016 Mellanox Technologies
 */

#ifndef _SELINUX_OBJSEC_H_
#define _SELINUX_OBJSEC_H_

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/binfmts.h>
#include <linux/in.h>
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
#include <net/net_namespace.h>
#include "flask.h"
#include "avc.h"

struct task_security_struct {
	u32 osid;		/* SID prior to last execve */
	u32 sid;		/* current SID */
	u32 exec_sid;		/* exec SID */
	u32 create_sid;		/* fscreate SID */
	u32 keycreate_sid;	/* keycreate SID */
	u32 sockcreate_sid;	/* sockcreate SID */
} __randomize_layout;

enum label_initialized {
	LABEL_INVALID,		/* invalid or not initialized */
	LABEL_INITIALIZED,	/* initialized */
	LABEL_PENDING
};

struct inode_security_struct {
	struct inode *inode;	/* back pointer to inode object */
	struct list_head list;	/* list of inode_security_struct */
	u32 task_sid;		/* SID of creating task */
	u32 sid;		/* SID of this object */
	u16 sclass;		/* security class of this object */
	unsigned char initialized;	/* initialization flag */
	spinlock_t lock;
};

struct file_security_struct {
	u32 sid;	/* SID of open file description */
	u32 fown_sid;	/* SID of file owner (for SIGIO) */
	u32 isid;	/* SID of inode at the time of file open */
	u32 pseqno;	/* Policy seqno at the time of file open */
};

struct superblock_security_struct {
	u32 sid;		/* SID of file system superblock */
	u32 def_sid;		/* default SID for labeling */
	u32 mntpoint_sid;	/* SECURITY_FS_USE_MNTPOINT context for files */
	unsigned short behavior;	/* labeling behavior */
	unsigned short flags;		/* which mount options were specified */
	struct mutex lock;
	struct list_head isec_head;
	spinlock_t isec_lock;
};

struct msg_security_struct {
	u32 sid;	/* SID of message */
};

struct ipc_security_struct {
	u16 sclass;	/* security class of this object */
	u32 sid;	/* SID of IPC resource */
};

struct netif_security_struct {
	struct net *ns;		/* network namespace */
	int ifindex;		/* device index */
	u32 sid;		/* SID for this interface */
};

struct netnode_security_struct {
	union {
		__be32 ipv4;		/* IPv4 node address */
		struct in6_addr ipv6;	/* IPv6 node address */
	} addr;
	u32 sid;	/* SID for this node */
	u16 family;	/* address family */
};

struct netport_security_struct {
	u32
sid; /* SID for this node */ u16 port; /* port number */ u8 protocol; /* transport protocol */ }; struct sk_security_struct { #ifdef CONFIG_NETLABEL enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, NLBL_LABELED, NLBL_REQSKB, NLBL_CONNLABELED, } nlbl_state; struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */ #endif u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ u16 sclass; /* sock security class */ enum { /* SCTP association state */ SCTP_ASSOC_UNSET = 0, SCTP_ASSOC_SET, } sctp_assoc_state; }; struct tun_security_struct { u32 sid; /* SID for the tun device sockets */ }; struct key_security_struct { u32 sid; /* SID of key */ }; struct ib_security_struct { u32 sid; /* SID of the queue pair or MAD agent */ }; struct pkey_security_struct { u64 subnet_prefix; /* Port subnet prefix */ u16 pkey; /* PKey number */ u32 sid; /* SID of pkey */ }; struct bpf_security_struct { u32 sid; /* SID of bpf obj creator */ }; struct perf_event_security_struct { u32 sid; /* SID of perf_event obj creator */ }; extern struct lsm_blob_sizes selinux_blob_sizes; static inline struct task_security_struct *selinux_cred(const struct cred *cred) { return cred->security + selinux_blob_sizes.lbs_cred; } static inline struct file_security_struct *selinux_file(const struct file *file) { return file->f_security + selinux_blob_sizes.lbs_file; } static inline struct inode_security_struct * selinux_inode(const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + selinux_blob_sizes.lbs_inode; } static inline struct msg_security_struct * selinux_msg_msg(const struct msg_msg *msg_msg) { return msg_msg->security + selinux_blob_sizes.lbs_msg_msg; } static inline struct ipc_security_struct * selinux_ipc(const struct kern_ipc_perm *ipc) { return ipc->security + selinux_blob_sizes.lbs_ipc; } /* * get the subjective security ID of the current task */ static inline u32 current_sid(void) { const struct task_security_struct *tsec = selinux_cred(current_cred()); return tsec->sid; } static inline struct superblock_security_struct * selinux_superblock(const struct super_block *superblock) { return superblock->s_security + selinux_blob_sizes.lbs_superblock; } #ifdef CONFIG_KEYS static inline struct key_security_struct *selinux_key(const struct key *key) { return key->security + selinux_blob_sizes.lbs_key; } #endif /* CONFIG_KEYS */ static inline struct sk_security_struct *selinux_sock(const struct sock *sock) { return sock->sk_security + selinux_blob_sizes.lbs_sock; } static inline struct tun_security_struct *selinux_tun_dev(void *security) { return security + selinux_blob_sizes.lbs_tun_dev; } static inline struct ib_security_struct *selinux_ib(void *ib_sec) { return ib_sec + selinux_blob_sizes.lbs_ib; } static inline struct perf_event_security_struct * selinux_perf_event(void *perf_event) { return perf_event + selinux_blob_sizes.lbs_perf_event; } #endif /* _SELINUX_OBJSEC_H_ */
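As a usage note for the accessors above: every selinux_*() helper simply offsets into the blob that the LSM framework allocated for the object, so hook code recovers its per-object state without any extra allocation. A minimal, hypothetical sketch (example_file_sid() is not a kernel function):

static inline u32 example_file_sid(const struct file *file)
{
	/* The SELinux part of the file's security blob starts at lbs_file. */
	const struct file_security_struct *fsec = selinux_file(file);

	return fsec->sid;	/* SID of the open file description */
}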
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
#include <linux/sched/coredump.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is the preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        atomic_inc(&mm->mm_count);
}

static inline void smp_mb__after_mmgrab(void)
{
        smp_mb__after_atomic();
}

extern void __mmdrop(struct mm_struct *mm);

static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * The implicit full barrier implied by atomic_dec_and_test() is
         * required by the membarrier system call before returning to
         * user-space, after storing to rq->curr.
         */
        if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                __mmdrop(mm);
}

#ifdef CONFIG_PREEMPT_RT
/*
 * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is
 * by far the least expensive way to do that.
 */
static inline void __mmdrop_delayed(struct rcu_head *rhp)
{
        struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);

        __mmdrop(mm);
}

/*
 * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
 * kernels via RCU.
 */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        /* Provides a full memory barrier. See mmdrop() */
        if (atomic_dec_and_test(&mm->mm_count))
                call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
#else
static inline void mmdrop_sched(struct mm_struct *mm)
{
        mmdrop(mm);
}
#endif

/* Helpers for lazy TLB mm refcounting */
static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
        if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                mmgrab(mm);
}

static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
        if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                mmdrop(mm);
        } else {
                /*
                 * mmdrop_lazy_tlb must provide a full memory barrier, see the
                 * membarrier comment in finish_task_switch() which relies on
                 * this.
                 */
                smp_mb();
        }
}

static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
{
        if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                mmdrop_sched(mm);
        else
                smp_mb(); /* see mmdrop_lazy_tlb() above */
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        atomic_inc(&mm->mm_users);
}

static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);

#ifdef CONFIG_MMU
/* same as above but performs the slow path from the async context. Can
 * be called from the atomic context as well
 */
void mmput_async(struct mm_struct *);
#endif

/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);

/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);

/* Remove the current task's stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);

/* Remove the current task's stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);

#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr, len, flags) (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);

unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                                     unsigned long len, unsigned long pgoff,
                                     unsigned long flags, vm_flags_t vm_flags);
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
                                             unsigned long addr,
                                             unsigned long len,
                                             unsigned long pgoff,
                                             unsigned long flags,
                                             vm_flags_t vm_flags);

unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
                                   unsigned long addr, unsigned long len,
                                   unsigned long pgoff, unsigned long flags);

unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
                                           struct file *filp,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags,
                                           vm_flags_t vm_flags);

unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags,
                                        vm_flags_t vm_flags);
unsigned long generic_get_unmapped_area_topdown(struct file *filp,
                                                unsigned long addr,
                                                unsigned long len,
                                                unsigned long pgoff,
                                                unsigned long flags,
                                                vm_flags_t vm_flags);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         struct rlimit *rlim_stack) {}
#endif

static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret;

        /*
         * need RCU to access ->real_parent if CLONE_VM was used along with
         * CLONE_PARENT.
         *
         * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
         * imply CLONE_VM.
         *
         * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
         * ->real_parent is not necessarily the task doing vfork(), so in
         * theory we can't rely on task_lock() if we want to dereference it.
         *
         * And in this case we can't trust the real_parent->mm == tsk->mm
         * check, it can be false negative. But we do not care, if init or
         * another oom-unkillable task does this it should blame itself.
         */
        rcu_read_lock();
        ret = tsk->vfork_done &&
              rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Applies per-task gfp context to the given allocation flags.
 * PF_MEMALLOC_NOIO implies GFP_NOIO
 * PF_MEMALLOC_NOFS implies GFP_NOFS
 * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        unsigned int pflags = READ_ONCE(current->flags);

        if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
                /*
                 * NOIO implies both NOIO and NOFS, and it is the weaker
                 * (more constrained) context, so always make sure it takes
                 * precedence.
                 */
                if (pflags & PF_MEMALLOC_NOIO)
                        flags &= ~(__GFP_IO | __GFP_FS);
                else if (pflags & PF_MEMALLOC_NOFS)
                        flags &= ~__GFP_FS;

                if (pflags & PF_MEMALLOC_PIN)
                        flags &= ~__GFP_MOVABLE;
        }
        return flags;
}

#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(unsigned long ip);
extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
static inline void __fs_reclaim_acquire(unsigned long ip) { }
static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif

/* Any memory-allocation retry loop should use
 * memalloc_retry_wait(), and pass the flags for the most
 * constrained allocation attempt that might have failed.
 * This provides useful documentation of where loops are,
 * and a central place to fine tune the waiting as the MM
 * implementation changes.
 */
static inline void memalloc_retry_wait(gfp_t gfp_flags)
{
        /* We use io_schedule_timeout because waiting for memory
         * typically includes waiting for dirty pages to be
         * written out, which requires IO.
         */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        gfp_flags = current_gfp_context(gfp_flags);
        if (gfpflags_allow_blocking(gfp_flags) &&
            !(gfp_flags & __GFP_NORETRY))
                /* Probably waited already, no need for much more */
                io_schedule_timeout(1);
        else
                /* Probably didn't wait, and has now released a lock,
                 * so now is a good time to wait
                 */
                io_schedule_timeout(HZ/50);
}

/**
 * might_alloc - Mark possible allocation sites
 * @gfp_mask: gfp_t flags that would be used to allocate
 *
 * Similar to might_sleep() and other annotations, this can be used in
 * functions that might allocate, but often don't. Compiles to nothing
 * without CONFIG_LOCKDEP. Includes a conditional might_sleep() if
 * @gfp_mask allows blocking.
 */
static inline void might_alloc(gfp_t gfp_mask)
{
        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}

/**
 * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
 *
 * This allows PF_* flags to be conveniently added, irrespective of current
 * value, and then the old version restored with memalloc_flags_restore().
 */
static inline unsigned memalloc_flags_save(unsigned flags)
{
        unsigned oldflags = ~current->flags & flags;

        current->flags |= flags;
        return oldflags;
}

static inline void memalloc_flags_restore(unsigned flags)
{
        current->flags &= ~flags;
}

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * This function marks the beginning of the GFP_NOIO allocation scope.
 * All further allocations will implicitly drop the __GFP_IO flag and so
 * they are safe for the IO critical section from the allocation recursion
 * point of view. Use memalloc_noio_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_noio_restore.
 */
static inline unsigned int memalloc_noio_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC_NOIO);
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
 * Always make sure that the given flags are the return value from the
 * pairing memalloc_noio_save call.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * This function marks the beginning of the GFP_NOFS allocation scope.
 * All further allocations will implicitly drop the __GFP_FS flag and so
 * they are safe for the FS critical section from the allocation recursion
 * point of view. Use memalloc_nofs_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_nofs_restore.
 */
static inline unsigned int memalloc_nofs_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC_NOFS);
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
 * Always make sure that the given flags are the return value from the
 * pairing memalloc_nofs_save call.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
 *
 * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
 * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
 * prevents entering reclaim and allows access to all memory reserves. This
 * should only be used when the caller guarantees the allocation will allow
 * more memory to be freed very shortly, i.e. it needs to allocate some memory
 * in the process of freeing memory, and cannot reclaim due to potential
 * recursion.
 *
 * Users of this scope have to be extremely careful to not deplete the
 * reserves completely and implement a throttling mechanism which controls
 * the consumption of the reserve based on the amount of freed memory. Usage
 * of a pre-allocated pool (e.g. mempool) should be always considered before
 * using this scope.
 *
 * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC.
 *
 * Context: This function should not be used in an interrupt context as that
 * one does not give PF_MEMALLOC access to reserves.
 * See __gfp_pfmemalloc_flags().
 * Return: The saved flags to be passed to memalloc_noreclaim_restore.
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC);
}

/**
 * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save
 * function. Always make sure that the given flags are the return value from
 * the pairing memalloc_noreclaim_save call.
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

/**
 * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
 *
 * This function marks the beginning of the ~__GFP_MOVABLE allocation scope.
 * All further allocations will implicitly remove the __GFP_MOVABLE flag,
 * which will constrain the allocations to zones that allow long term
 * pinning, i.e. not ZONE_MOVABLE zones.
 *
 * Return: The saved flags to be passed to memalloc_pin_restore.
 */
static inline unsigned int memalloc_pin_save(void)
{
        return memalloc_flags_save(PF_MEMALLOC_PIN);
}

/**
 * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save
 * function. Always make sure that the given flags are the return value from
 * the pairing memalloc_pin_save call.
 */
static inline void memalloc_pin_restore(unsigned int flags)
{
        memalloc_flags_restore(flags);
}

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * This function marks the beginning of the remote memcg charging scope. All
 * the __GFP_ACCOUNT allocations until the end of the scope will be charged
 * to the given memcg.
 *
 * Please make sure that the caller has a reference to the passed memcg
 * structure, so its lifetime is guaranteed to exceed the scope between two
 * set_active_memcg() calls.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *old;

        if (!in_task()) {
                old = this_cpu_read(int_active_memcg);
                this_cpu_write(int_active_memcg, memcg);
        } else {
                old = current->active_memcg;
                current->active_memcg = memcg;
        }

        return old;
}
#else
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY           = (1U << 0),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                 = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY            = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                  = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE       = (1U << 5),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY      = (1U << 6),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ            = (1U << 7),
};

enum {
        MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
        MEMBARRIER_FLAG_RSEQ      = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        /*
         * The atomic_read() below prevents CSE. The following should
         * help the compiler generate more efficient code on architectures
         * where sync_core_before_usermode() is a no-op.
         */
        if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE))
                return;
        if (current->mm != mm)
                return;
        if (likely(!(atomic_read(&mm->membarrier_state) &
                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
                return;
        sync_core_before_usermode();
}

extern void membarrier_exec_mmap(struct mm_struct *mm);

extern void membarrier_update_current_mm(struct mm_struct *next_mm);

#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
                                             struct mm_struct *next,
                                             struct task_struct *tsk)
{
}
#endif
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif

#endif /* _LINUX_SCHED_MM_H */
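The memalloc_*_save()/restore() pairs above all reduce to the same bit discipline in memalloc_flags_save(): the save helper returns only the PF_* bits that were previously clear, so the paired restore clears exactly those bits and nested scopes unwind correctly. A minimal userspace sketch of that discipline follows; the demo_* names are hypothetical stand-ins for current->flags and the PF_MEMALLOC_* bits.

#include <assert.h>
#include <stdio.h>

#define DEMO_NOIO 0x1u /* hypothetical stand-in for PF_MEMALLOC_NOIO */

static unsigned int demo_task_flags; /* stand-in for current->flags */

/* Mirrors memalloc_flags_save(): report only the bits we newly set. */
static unsigned int demo_flags_save(unsigned int flags)
{
        unsigned int oldflags = ~demo_task_flags & flags;

        demo_task_flags |= flags;
        return oldflags;
}

/* Mirrors memalloc_flags_restore(): clear only the bits we newly set. */
static void demo_flags_restore(unsigned int flags)
{
        demo_task_flags &= ~flags;
}

int main(void)
{
        unsigned int outer = demo_flags_save(DEMO_NOIO); /* sets NOIO */
        unsigned int inner = demo_flags_save(DEMO_NOIO); /* already set: 0 */

        demo_flags_restore(inner); /* no-op, NOIO stays in effect */
        assert(demo_task_flags & DEMO_NOIO);
        demo_flags_restore(outer); /* outermost scope clears it */
        assert(!(demo_task_flags & DEMO_NOIO));
        printf("nested NOIO scopes unwound correctly\n");
        return 0;
}

Because the inner save returns 0 for bits that were already set, the inner restore is a no-op and the scope stays in effect until the outermost restore, which is what makes these scopes safe to nest.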
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_FILE_REF_H
#define _LINUX_FILE_REF_H

#include <linux/atomic.h>
#include <linux/preempt.h>
#include <linux/types.h>

/*
 * file_ref is a reference count implementation specifically for use by
 * files. It takes inspiration from rcuref but differs in key aspects
 * such as support for SLAB_TYPESAFE_BY_RCU type caches.
 *
 * FILE_REF_ONEREF                                   FILE_REF_MAXREF
 * 0x0000000000000000UL                         0x7FFFFFFFFFFFFFFFUL
 * <-------------------valid------------------->
 *
 *                      FILE_REF_SATURATED
 * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
 * <-----------------------saturation zone---------------------->
 *
 * FILE_REF_RELEASED                   FILE_REF_DEAD
 * 0xC000000000000000UL           0xE000000000000000UL
 * <-------------------dead zone------------------->
 *
 * FILE_REF_NOREF
 * 0xFFFFFFFFFFFFFFFFUL
 */
#ifdef CONFIG_64BIT
#define FILE_REF_ONEREF    0x0000000000000000UL
#define FILE_REF_MAXREF    0x7FFFFFFFFFFFFFFFUL
#define FILE_REF_SATURATED 0xA000000000000000UL
#define FILE_REF_RELEASED  0xC000000000000000UL
#define FILE_REF_DEAD      0xE000000000000000UL
#define FILE_REF_NOREF     0xFFFFFFFFFFFFFFFFUL
#else
#define FILE_REF_ONEREF    0x00000000U
#define FILE_REF_MAXREF    0x7FFFFFFFU
#define FILE_REF_SATURATED 0xA0000000U
#define FILE_REF_RELEASED  0xC0000000U
#define FILE_REF_DEAD      0xE0000000U
#define FILE_REF_NOREF     0xFFFFFFFFU
#endif

typedef struct {
#ifdef CONFIG_64BIT
        atomic64_t refcnt;
#else
        atomic_t refcnt;
#endif
} file_ref_t;

/**
 * file_ref_init - Initialize a file reference count
 * @ref: Pointer to the reference count
 * @cnt: The initial reference count, typically '1'
 */
static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
{
        atomic_long_set(&ref->refcnt, cnt - 1);
}

bool __file_ref_put(file_ref_t *ref, unsigned long cnt);

/**
 * file_ref_get - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
 *
 * Provides full memory ordering.
 *
 * Return: False if the attempt to acquire a reference failed. This happens
 *         when the last reference has been put already. True if a reference
 *         was successfully acquired
 */
static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
{
        /*
         * Unconditionally increase the reference count with full
         * ordering. The saturation and dead zones provide enough
         * tolerance for this.
         *
         * If this returns a negative value, the file in question could
         * already have been freed and immediately reused due to
         * SLAB_TYPESAFE_BY_RCU. Hence, unconditionally altering the file
         * reference count, e.g. to reset it back to the middle of the
         * dead zone, risks marking someone else's file as dead behind
         * their back.
         *
         * It would be possible to do a careful:
         *
         * cnt = atomic_long_inc_return(&ref->refcnt);
         * if (likely(cnt >= 0))
         *         return true;
         *
         * and then something like:
         *
         * if (cnt >= FILE_REF_RELEASED)
         *         atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD);
         *
         * to set the value back to the middle of the dead zone. But it's
         * practically impossible to go from FILE_REF_DEAD to
         * FILE_REF_ONEREF. It would need 2305843009213693952 (2^61)
         * file_ref_get()s to resurrect such a dead file.
         */
        return !atomic_long_add_negative(1, &ref->refcnt);
}

/**
 * file_ref_inc - Acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Acquire an additional reference on a file. Warns if the caller didn't
 * already hold a reference.
 */
static __always_inline void file_ref_inc(file_ref_t *ref)
{
        long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt);

        WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference");
}

/**
 * file_ref_put -- Release a file reference
 * @ref: Pointer to the reference count
 *
 * Provides release memory ordering, such that prior loads and stores
 * are done before, and provides an acquire ordering on success such
 * that free() must come after.
 *
 * Return: True if this was the last reference with no future references
 *         possible. This signals the caller that it can safely release
 *         the object which is protected by the reference counter.
 *         False if there are still active references or the put() raced
 *         with a concurrent get()/put() pair. Caller is not allowed to
 *         release the protected object.
 */
static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
{
        long cnt;

        /*
         * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put()
         * calls don't risk UAFs when a file is recycled, it is still
         * vulnerable to UAFs caused by freeing the whole slab page once
         * it becomes unused. Preventing file_ref_put() from being
         * preempted protects against this.
         */
        guard(preempt)();
        /*
         * Unconditionally decrease the reference count. The saturation
         * and dead zones provide enough tolerance for this. If this
         * fails then we need to handle the last reference drop and
         * cases inside the saturation and dead zones.
         */
        cnt = atomic_long_dec_return(&ref->refcnt);
        if (cnt >= 0)
                return false;
        return __file_ref_put(ref, cnt);
}

/**
 * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF
 * @ref: Pointer to the reference count
 *
 * Semantically it is equivalent to calling file_ref_put(), but it trades lower
 * performance in face of other CPUs also modifying the refcount for higher
 * performance when this happens to be the last reference.
 *
 * For the last reference file_ref_put() issues 2 atomics. One to drop the
 * reference and another to transition it to FILE_REF_DEAD. This routine does
 * the work in one step, but in order to do so it has to pre-read the variable,
 * which decreases scalability.
 *
 * Use with close() et al, stick to file_ref_put() by default.
 */
static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref)
{
        long old;

        old = atomic_long_read(&ref->refcnt);
        if (likely(old == FILE_REF_ONEREF)) {
                if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD)))
                        return true;
        }
        return file_ref_put(ref);
}

/**
 * file_ref_read - Read the number of file references
 * @ref: Pointer to the reference count
 *
 * Return: The number of held references (0 ... N)
 */
static inline unsigned long file_ref_read(file_ref_t *ref)
{
        unsigned long c = atomic_long_read(&ref->refcnt);

        /* Return 0 if within the DEAD zone. */
        return c >= FILE_REF_RELEASED ? 0 : c + 1;
}

/*
 * __file_ref_read_raw - Return the value stored in ref->refcnt
 * @ref: Pointer to the reference count
 *
 * Return: The raw value found in the counter
 *
 * A hack for file_needs_f_pos_lock(), you probably want to use
 * file_ref_read() instead.
 */
static inline unsigned long __file_ref_read_raw(file_ref_t *ref)
{
        return atomic_long_read(&ref->refcnt);
}

#endif /* _LINUX_FILE_REF_H */
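A minimal userspace model of the counting scheme above: file_ref stores "references - 1", so FILE_REF_ONEREF is stored as 0 and the last put drives the counter negative, which is what atomic_long_add_negative() and the negative-return check in file_ref_put() detect. This sketch models only the fast path; the saturation and dead-zone handling of __file_ref_put() is omitted, and the demo_* names are hypothetical.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
        atomic_long refcnt; /* stores "references - 1" */
} demo_ref_t;

static void demo_ref_init(demo_ref_t *ref, long cnt)
{
        atomic_store(&ref->refcnt, cnt - 1); /* one ref stored as 0 */
}

static bool demo_ref_get(demo_ref_t *ref)
{
        /* Mirrors !atomic_long_add_negative(1, ...): the get fails
         * once the counter has entered the negative (dead) zone. */
        return atomic_fetch_add(&ref->refcnt, 1) + 1 >= 0;
}

static bool demo_ref_put(demo_ref_t *ref)
{
        /* The last put drops the counter from 0 to -1. */
        return atomic_fetch_sub(&ref->refcnt, 1) - 1 < 0;
}

int main(void)
{
        demo_ref_t ref;

        demo_ref_init(&ref, 1);                       /* counter 0: one ref */
        demo_ref_get(&ref);                           /* counter 1: two refs */
        printf("last put? %d\n", demo_ref_put(&ref)); /* 0: a ref remains */
        printf("last put? %d\n", demo_ref_put(&ref)); /* 1: that was the last */
        return 0;
}

The "minus one" bias is the design point: the common get/put fast paths need no compare, only a sign check on the post-increment or post-decrement value.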
// SPDX-License-Identifier: GPL-2.0

#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
#include "internal.h"

/**
 * DOC: printk_ringbuffer overview
 *
 * Data Structure
 * --------------
 * The printk_ringbuffer is made up of 2 internal ringbuffers:
 *
 * desc_ring
 *	A ring of descriptors and their meta data (such as sequence number,
 *	timestamp, loglevel, etc.) as well as internal state information about
 *	the record and logical positions specifying where in the other
 *	ringbuffer the text strings are located.
 *
 * text_data_ring
 *	A ring of data blocks. A data block consists of an unsigned long
 *	integer (ID) that maps to a desc_ring index followed by the text
 *	string of the record.
 *
 * The internal state information of a descriptor is the key element to allow
 * readers and writers to locklessly synchronize access to the data.
 *
 * Implementation
 * --------------
 *
 * Descriptor Ring
 * ~~~~~~~~~~~~~~~
 * The descriptor ring is an array of descriptors. A descriptor contains
 * essential meta data to track the data of a printk record using
 * blk_lpos structs pointing to associated text data blocks (see
 * "Data Ring" below). Each descriptor is assigned an ID that maps
 * directly to index values of the descriptor array and has a state. The ID
 * and the state are bitwise combined into a single descriptor field named
 * @state_var, allowing ID and state to be synchronously and atomically
 * updated.
 *
 * Descriptors have four states:
 *
 * reserved
 *	A writer is modifying the record.
 *
 * committed
 *	The record and all its data are written. A writer can reopen the
 *	descriptor (transitioning it back to reserved), but in the committed
 *	state the data is consistent.
 *
 * finalized
 *	The record and all its data are complete and available for reading. A
 *	writer cannot reopen the descriptor.
 *
 * reusable
 *	The record exists, but its text and/or meta data may no longer be
 *	available.
 *
 * Querying the @state_var of a record requires providing the ID of the
 * descriptor to query. This can yield a possible fifth (pseudo) state:
 *
 * miss
 *	The descriptor being queried has an unexpected ID.
 *
 * The descriptor ring has a @tail_id that contains the ID of the oldest
 * descriptor and @head_id that contains the ID of the newest descriptor.
 *
 * When a new descriptor should be created (and the ring is full), the tail
 * descriptor is invalidated by first transitioning to the reusable state and
 * then invalidating all tail data blocks up to and including the data blocks
 * associated with the tail descriptor (for the text ring). Then
 * @tail_id is advanced, followed by advancing @head_id. And finally the
 * @state_var of the new descriptor is initialized to the new ID and reserved
 * state.
 *
 * The @tail_id can only be advanced if the new @tail_id would be in the
 * committed or reusable queried state. This guarantees that a valid sequence
 * number for the tail is always available.
 *
 * Descriptor Finalization
 * ~~~~~~~~~~~~~~~~~~~~~~~
 * When a writer calls the commit function prb_commit(), record data is
 * fully stored and is consistent within the ringbuffer. However, a writer can
 * reopen that record, claiming exclusive access (as with prb_reserve()), and
 * modify that record. When finished, the writer must again commit the record.
* * In order for a record to be made available to readers (and also become * recyclable for writers), it must be finalized. A finalized record cannot be * reopened and can never become "unfinalized". Record finalization can occur * in three different scenarios: * * 1) A writer can simultaneously commit and finalize its record by calling * prb_final_commit() instead of prb_commit(). * * 2) When a new record is reserved and the previous record has been * committed via prb_commit(), that previous record is automatically * finalized. * * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * * Data Ring * ~~~~~~~~~ * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte * array ringbuffer. * * Each data block consists of an ID followed by the writer data. The ID is * the identifier of a descriptor that is associated with the data block. A * given data block is considered valid if all of the following conditions * are met: * * 1) The descriptor associated with the data block is in the committed * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. * * 3) The data block is within the head/tail logical position range. * * If the writer data of a data block would extend beyond the end of the * byte array, only the ID of the data block is stored at the logical * position and the full data block (ID and writer data) is stored at the * beginning of the byte array. The referencing blk_lpos will point to the * ID before the wrap and the next data block will be at the logical * position adjacent the full data block after the wrap. * * Data rings have a @tail_lpos that points to the beginning of the oldest * data block and a @head_lpos that points to the logical position of the * next (not yet existing) data block. * * When a new data block should be created (and the ring is full), tail data * blocks will first be invalidated by putting their associated descriptors * into the reusable state and then pushing the @tail_lpos forward beyond * them. Then the @head_lpos is pushed forward and is associated with a new * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * * Info Array * ~~~~~~~~~~ * The general meta data of printk records are stored in printk_info structs, * stored in an array with the same number of elements as the descriptor ring. * Each info corresponds to the descriptor of the same index in the * descriptor ring. Info validity is confirmed by evaluating the corresponding * descriptor before and after loading the info. * * Usage * ----- * Here are some simple examples demonstrating writers and readers. For the * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of * 1 MiB (2 ^ (15 + 5)) for text data. 
* * Sample writer code:: * * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit and finalize the record * prb_final_commit(&e); * } * * Note that additional writer functions are available to extend a record * after it has been committed but not yet finalized. This can be done as * long as no new records have been reserved and the caller is the same. * * Sample writer code (record extending):: * * // alternate rest of previous example * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) * prb_commit(&e); * } * * ... * * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * * // try to extend, but only if it does not exceed 32 bytes * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * * r.info->text_len += 5; * * // commit and finalize the record * prb_final_commit(&e); * } * * Sample reader code:: * * struct printk_info info; * struct printk_record r; * char text_buf[32]; * u64 seq; * * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) * pr_warn("lost %llu records\n", info.seq - seq); * * if (info.text_len > r.text_buf_size) { * pr_warn("record %llu text truncated\n", info.seq); * text_buf[r.text_buf_size - 1] = 0; * } * * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to * allow complex record access. * * ABA Issues * ~~~~~~~~~~ * To help avoid ABA issues, descriptors are referenced by IDs (array index * values combined with tagged bits counting array wraps) and data blocks are * referenced by logical positions (array index values combined with tagged * bits counting array wraps). However, on 32-bit systems the number of * tagged bits is relatively small such that an ABA incident is (at least * theoretically) possible. For example, if 4 million maximally sized (1KiB) * printk messages were to occur in NMI context on a 32-bit system, the * interrupted context would not be able to recognize that the 32-bit integer * completely wrapped and thus represents a different data block than the one * the interrupted context expects. * * To help combat this possibility, additional state checking is performed * (such as using cmpxchg() even though set() would suffice). These extra * checks are commented as such and will hopefully catch any ABA issue that * a 32-bit system might experience. * * Memory Barriers * ~~~~~~~~~~~~~~~ * Multiple memory barriers are used. To simplify proving correctness and * generating litmus tests, lines of code related to memory barriers * (loads, stores, and the associated memory barriers) are labeled:: * * LMM(function:letter) * * Comments reference the labels using only the "function:letter" part. 
* * The memory barrier pairs and their ordering are: * * desc_reserve:D / desc_reserve:B * push descriptor tail (id), then push descriptor head (id) * * desc_reserve:D / data_push_tail:B * push data tail (lpos), then set new descriptor reserved (state) * * desc_reserve:D / desc_push_tail:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:D / prb_first_seq:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * * desc_reopen_last:A / _prb_commit:B * set descriptor reserved (state), then read descriptor data * * _prb_commit:B / desc_reserve:D * set new descriptor committed (state), then check descriptor head (id) * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * * desc_push_tail:B / desc_reserve:D * set descriptor reusable (state), then push descriptor tail (id) * * desc_update_last_finalized:A / desc_last_finalized_seq:A * store finalized record, then set new highest finalized sequence number */ #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) #define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) #define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) #define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) /* Determine the data array index from a logical position. */ #define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) /* Determine the desc array index from an ID or sequence number. */ #define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) #define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) /* Get the ID for the same index of the previous wrap as the given ID. */ #define DESC_ID_PREV_WRAP(desc_ring, id) \ DESC_ID((id) - DESCS_COUNT(desc_ring)) /* * A data block: mapped directly to the beginning of the data block area * specified as a logical position within the data ring. * * @id: the ID of the associated descriptor * @data: the writer data * * Note that the size of a data block is only known by its associated * descriptor. */ struct prb_data_block { unsigned long id; char data[]; }; /* * Return the descriptor associated with @n. @n can be either a * descriptor ID or a sequence number. */ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } /* * Return the printk_info associated with @n. @n can be either a * descriptor ID or a sequence number. 
*/ static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; } static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; } /* * Increase the data size to account for data block meta data plus any * padding so that the adjacent data block is aligned on the ID size. */ static unsigned int to_blk_size(unsigned int size) { struct prb_data_block *db = NULL; size += sizeof(*db); size = ALIGN(size, sizeof(db->id)); return size; } /* * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { struct prb_data_block *db = NULL; if (size == 0) return true; /* * Ensure the alignment padded size could possibly fit in the data * array. The largest possible data block must still leave room for * at least the ID of the next block. */ size = to_blk_size(size); if (size > DATA_SIZE(data_ring) - sizeof(db->id)) return false; return true; } /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) { if (id != DESC_ID(state_val)) return desc_miss; return DESC_STATE(state_val); } /* * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. * * The sequence number and caller_id can be optionally retrieved. Like all * non-state_var data, they are only valid if the descriptor is in a * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out, u64 *seq_out, u32 *caller_id_out) { struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; unsigned long state_val; /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); if (d_state == desc_miss || d_state == desc_reserved) { /* * The descriptor is in an inconsistent state. Set at least * @state_var so that the caller can see the details of * the inconsistent state. */ goto out; } /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might * not apply to the descriptor state. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_read:A reads from _prb_commit:B, then desc_read:C reads * from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ smp_rmb(); /* LMM(desc_read:B) */ /* * Copy the descriptor data. The data is not valid until the * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. */ if (desc_out) { memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. 
Guarantee the descriptor content is loaded before re-checking * the state. This avoids reading an obsolete descriptor state * that may not apply to the copied content. This pairs with * desc_reserve:F. * * Memory barrier involvement: * * If desc_read:C reads from desc_reserve:G, then desc_read:E * reads from desc_reserve:F. * * Relies on: * * WMB from desc_reserve:F to desc_reserve:G * matching * RMB from desc_read:C to desc_read:E * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may * not apply to the copied data. This pairs with data_alloc:A and * data_realloc:A. * * Memory barrier involvement: * * If copy_data:A reads from data_alloc:B, then desc_read:E * reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_alloc:B * matching * RMB from desc_read:C to desc_read:E * * Note: desc_make_reusable:A and data_alloc:B can be different * CPUs. However, the data_alloc:B CPU (which performs the * full memory barrier) must have previously seen * desc_make_reusable:A. */ smp_rmb(); /* LMM(desc_read:D) */ /* * The data has been copied. Return the current descriptor state, * which may have changed since the load above. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: if (desc_out) atomic_long_set(&desc_out->state_var, state_val); return d_state; } /* * Take a specified descriptor out of the finalized state by attempting * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } /* * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. */ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long lpos_begin, unsigned long lpos_end, unsigned long *lpos_out) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin); /* * Load the block ID from the data block. This is a data race * against a writer that may have newly reserved this data * area. If the loaded value matches a valid descriptor ID, * the blk_lpos of that descriptor will be checked to make * sure it points back to this data block. If the check fails, * the data area has been recycled by another writer. 
*/ id = blk->id; /* LMM(data_make_reusable:A) */ d_state = desc_read(desc_ring, id, &desc, NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: case desc_reserved: case desc_committed: return false; case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; desc_make_reusable(desc_ring, id); break; case desc_reusable: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; break; } /* Advance @lpos_begin to the next data block. */ lpos_begin = blk_lpos->next; } *lpos_out = lpos_begin; return true; } /* * Advance the data ring tail to at least @lpos. This function puts * descriptors into the reusable state if the tail is pushed beyond * their associated data block. */ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) { struct prb_data_ring *data_ring = &rb->text_data_ring; unsigned long tail_lpos_new; unsigned long tail_lpos; unsigned long next_lpos; /* If @lpos is from a data-less block, there is nothing to do. */ if (LPOS_DATALESS(lpos)) return true; /* * Any descriptor states that have transitioned to reusable due to the * data tail being pushed to this loaded value will be visible to this * CPU. This pairs with data_push_tail:D. * * Memory barrier involvement: * * If data_push_tail:A reads from data_push_tail:D, then this CPU can * see desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_push_tail:D * matches * READFROM from data_push_tail:D to data_push_tail:A * thus * READFROM from desc_make_reusable:A to this CPU */ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ /* * Loop until the tail lpos is at or beyond @lpos. This condition * may already be satisfied, resulting in no full memory barrier * from data_push_tail:D being performed. However, since this CPU * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. */ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) { /* * 1. Guarantee the block ID loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with * data_alloc:A and data_realloc:A. * * Memory barrier involvement: * * If data_make_reusable:A reads from data_alloc:B, * then data_push_tail:C reads from * data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to data_alloc:B * matching * RMB from data_make_reusable:A to * data_push_tail:C * * Note: data_push_tail:D and data_alloc:B can be * different CPUs. However, the data_alloc:B * CPU (which performs the full memory * barrier) must have previously seen * data_push_tail:D. * * 2. Guarantee the descriptor state loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled descriptor causing the tail lpos to * have been previously pushed. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If data_make_reusable:B reads from * desc_reserve:F, then data_push_tail:C reads * from data_push_tail:D. 
* * Relies on: * * MB from data_push_tail:D to desc_reserve:F * matching * RMB from data_make_reusable:B to * data_push_tail:C * * Note: data_push_tail:D and desc_reserve:F can * be different CPUs. However, the * desc_reserve:F CPU (which performs the * full memory barrier) must have previously * seen data_push_tail:D. */ smp_rmb(); /* LMM(data_push_tail:B) */ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos ); /* LMM(data_push_tail:C) */ if (tail_lpos_new == tail_lpos) return false; /* Another CPU pushed the tail. Try again. */ tail_lpos = tail_lpos_new; continue; } /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail lpos. A full * memory barrier is needed since other CPUs may have made * the descriptor states reusable. This pairs with * data_push_tail:A. */ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, next_lpos)) { /* LMM(data_push_tail:D) */ break; } } return true; } /* * Advance the desc ring tail. This function advances the tail by one * descriptor, thus invalidating the oldest descriptor. Before advancing * the tail, the tail descriptor is made reusable and all data blocks up to * and including the descriptor's data block are invalidated (i.e. the data * ring tail is pushed past the data block of the descriptor being made * reusable). */ static bool desc_push_tail(struct printk_ringbuffer *rb, unsigned long tail_id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: /* * If the ID is exactly 1 wrap behind the expected, it is * in the process of being reserved by another writer and * must be considered reserved. */ if (DESC_ID(atomic_long_read(&desc.state_var)) == DESC_ID_PREV_WRAP(desc_ring, tail_id)) { return false; } /* * The ID has changed. Another writer must have pushed the * tail and recycled the descriptor already. Success is * returned because the caller is only interested in the * specified tail being pushed, which it was. */ return true; case desc_reserved: case desc_committed: return false; case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: break; } /* * Data blocks must be invalidated before their associated * descriptor can be made available for recycling. Invalidating * them later is not possible because there is no way to trust * data blocks once their associated descriptor is gone. */ if (!data_push_tail(rb, desc.text_blk_lpos.next)) return false; /* * Check the next descriptor after @tail_id before pushing the tail * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or * equal to @head_id so there is no risk of pushing the tail past the * head. */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows * verifying the recycled descriptor state. A full memory * barrier is needed since other CPUs may have made the * descriptor states reusable. This pairs with desc_reserve:D. 
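 *
 * Note on the ID arithmetic below (illustrative): @tail_id and
 * @head_id store full, monotonically increasing descriptor IDs
 * (wrapping only within the available ID bits), while DESC_INDEX()
 * reduces an ID to an array slot. Advancing to DESC_ID(tail_id + 1)
 * therefore names the next slot, while comparing full IDs gives the
 * cmpxchg() its ABA protection.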
*/ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ } else { /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail ID in the * case that the descriptor has been recycled. This pairs * with desc_reserve:D. * * Memory barrier involvement: * * If desc_push_tail:A reads from desc_reserve:F, then * desc_push_tail:D reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB from desc_push_tail:A to desc_push_tail:D * * Note: desc_push_tail:B and desc_reserve:F can be different * CPUs. However, the desc_reserve:F CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_push_tail:C) */ /* * Re-check the tail ID. The descriptor following @tail_id is * not in an allowed tail state. But if the tail has since * been moved by another CPU, then it does not matter. */ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ return false; } return true; } /* Reserve a new descriptor, invalidating the oldest if necessary. */ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val; unsigned long id_prev_wrap; struct prb_desc *desc; unsigned long head_id; unsigned long id; head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); /* * Guarantee the head ID is read before reading the tail ID. * Since the tail ID is updated before the head ID, this * guarantees that @id_prev_wrap is never ahead of the tail * ID. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If desc_reserve:A reads from desc_reserve:D, then * desc_reserve:C reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:D * matching * RMB from desc_reserve:A to desc_reserve:C * * Note: desc_push_tail:B and desc_reserve:D can be different * CPUs. However, the desc_reserve:D CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_reserve:B) */ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id )) { /* LMM(desc_reserve:C) */ /* * Make space for the new descriptor by * advancing the tail. */ if (!desc_push_tail(rb, id_prev_wrap)) return false; } /* * 1. Guarantee the tail ID is read before validating the * recycled descriptor state. A read memory barrier is * sufficient for this. This pairs with desc_push_tail:B. * * Memory barrier involvement: * * If desc_reserve:C reads from desc_push_tail:B, then * desc_reserve:E reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to desc_push_tail:B * matching * RMB from desc_reserve:C to desc_reserve:E * * Note: desc_make_reusable:A and desc_push_tail:B can be * different CPUs. However, the desc_push_tail:B CPU * (which performs the full memory barrier) must have * previously seen desc_make_reusable:A. * * 2. Guarantee the tail ID is stored before storing the head * ID. This pairs with desc_reserve:B. * * 3. Guarantee any data ring tail changes are stored before * recycling the descriptor. Data ring tail changes can * happen via desc_push_tail()->data_push_tail(). A full * memory barrier is needed since another CPU may have * pushed the data ring tails. This pairs with * data_push_tail:B. * * 4. 
Guarantee a new tail ID is stored before recycling the * descriptor. A full memory barrier is needed since * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. * * 5. Guarantee the head ID is stored before trying to * finalize the previous descriptor. This pairs with * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ desc = to_desc(desc_ring, id); /* * If the descriptor has been recycled, verify the old state val. * See "ABA Issues" about why this verification is performed. */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } /* * Assign the descriptor a new ID and set its state to reserved. * See "ABA Issues" about why cmpxchg() instead of set() is used. * * Guarantee the new descriptor ID and state is stored before making * any other changes. A write memory barrier is sufficient for this. * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } /* Now data in @desc can be modified: LMM(desc_reserve:G) */ *id_out = id; return true; } /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) { unsigned long begin_lpos; unsigned long next_lpos; begin_lpos = lpos; next_lpos = lpos + size; /* First check if the data block does not wrap. */ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); } /* * Allocate a new data block, invalidating the oldest data block(s) * if necessary. This function also associates the data block with * a specified descriptor. */ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long begin_lpos; unsigned long next_lpos; if (size == 0) { /* * Data blocks are not created for empty lines. Instead, the * reader will recognize these special lpos values and handle * it appropriately. */ blk_lpos->begin = EMPTY_LINE_LPOS; blk_lpos->next = EMPTY_LINE_LPOS; return NULL; } size = to_blk_size(size); begin_lpos = atomic_long_read(&data_ring->head_lpos); do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { /* Failed to allocate, specify a data-less block. */ blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; } /* * 1. Guarantee any descriptor states that have transitioned * to reusable are stored before modifying the newly * allocated data area. A full memory barrier is needed * since other CPUs may have made the descriptor states * reusable. See data_push_tail:A about why the reusable * states are visible. This pairs with desc_read:D. * * 2. Guarantee any updated tail lpos is stored before * modifying the newly allocated data area. Another CPU may * be in data_make_reusable() and is reading a block ID * from this area. data_make_reusable() can handle reading * a garbage block ID value, but then it must be able to * load a new tail lpos. 
A full memory barrier is needed
		 *    since other CPUs may have updated the tail lpos. This
		 *    pairs with data_push_tail:B.
		 */
	} while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
					  next_lpos)); /* LMM(data_alloc:A) */

	blk = to_block(data_ring, begin_lpos);
	blk->id = id; /* LMM(data_alloc:B) */

	if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
		/* Wrapping data blocks store their data at the beginning. */
		blk = to_block(data_ring, 0);

		/*
		 * Store the ID on the wrapped block for consistency.
		 * The printk_ringbuffer does not actually use it.
		 */
		blk->id = id;
	}

	blk_lpos->begin = begin_lpos;
	blk_lpos->next = next_lpos;

	return &blk->data[0];
}

/*
 * Try to resize an existing data block associated with the descriptor
 * specified by @id. If the resized data block should become wrapped, it
 * copies the old data to the new data block. If @size yields a data block
 * with the same or a smaller size, the data block is left as is.
 *
 * Fail if this is not the last allocated data block or if there is not
 * enough space or it is not possible to make enough space.
 *
 * Return a pointer to the beginning of the entire data buffer or NULL on
 * failure.
 */
static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
			  struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
	struct prb_data_ring *data_ring = &rb->text_data_ring;
	struct prb_data_block *blk;
	unsigned long head_lpos;
	unsigned long next_lpos;
	bool wrapped;

	/* Reallocation only works if @blk_lpos is the newest data block. */
	head_lpos = atomic_long_read(&data_ring->head_lpos);
	if (head_lpos != blk_lpos->next)
		return NULL;

	/* Keep track if @blk_lpos was a wrapping data block. */
	wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) !=
		   DATA_WRAPS(data_ring, blk_lpos->next));

	size = to_blk_size(size);

	next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);

	/* If the data block does not increase, there is nothing to do. */
	if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
		if (wrapped)
			blk = to_block(data_ring, 0);
		else
			blk = to_block(data_ring, blk_lpos->begin);
		return &blk->data[0];
	}

	if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
		return NULL;

	/* The memory barrier involvement is the same as data_alloc:A. */
	if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
				     next_lpos)) { /* LMM(data_realloc:A) */
		return NULL;
	}

	blk = to_block(data_ring, blk_lpos->begin);

	if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
		struct prb_data_block *old_blk = blk;

		/* Wrapping data blocks store their data at the beginning. */
		blk = to_block(data_ring, 0);

		/*
		 * Store the ID on the wrapped block for consistency.
		 * The printk_ringbuffer does not actually use it.
		 */
		blk->id = id;

		if (!wrapped) {
			/*
			 * Since the allocated space is now in the newly
			 * created wrapping data block, copy the content
			 * from the old data block.
			 */
			memcpy(&blk->data[0], &old_blk->data[0],
			       (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
		}
	}

	blk_lpos->next = next_lpos;

	return &blk->data[0];
}

/* Return the number of bytes used by a data block. */
static unsigned int space_used(struct prb_data_ring *data_ring,
			       struct prb_data_blk_lpos *blk_lpos)
{
	/* Data-less blocks take no space. */
	if (BLK_DATALESS(blk_lpos))
		return 0;

	if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
		/* Data block does not wrap. */
		return (DATA_INDEX(data_ring, blk_lpos->next) -
			DATA_INDEX(data_ring, blk_lpos->begin));
	}

	/*
	 * For wrapping data blocks, the trailing (wasted) space is
	 * also counted.
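	 *
	 * Worked example (hypothetical values): with DATA_SIZE() == 4096,
	 * DATA_INDEX(begin) == 4000 and DATA_INDEX(next) == 100, the block
	 * owns the 96 trailing bytes of the array plus the first 100 bytes
	 * after the wrap, so 100 + 4096 - 4000 == 196 is returned below.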
*/ return (DATA_INDEX(data_ring, blk_lpos->next) + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * Given @blk_lpos, return a pointer to the writer data from the data block * and calculate the size of the data part. A NULL pointer is returned if * @blk_lpos specifies values that could never be legal. * * This function (used by readers) performs strict validation on the lpos * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ if (BLK_DATALESS(blk_lpos)) { /* * Records that are just empty lines are also valid, even * though they do not have a data block. For such records * explicitly return empty string data to signify success. */ if (blk_lpos->begin == EMPTY_LINE_LPOS && blk_lpos->next == EMPTY_LINE_LPOS) { *data_size = 0; return ""; } /* Data lost, invalid, or otherwise unavailable. */ return NULL; } /* Regular data block: @begin less than @next and in same wrap. */ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && blk_lpos->begin < blk_lpos->next) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == DATA_WRAPS(data_ring, blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); /* Illegal block description. */ } else { WARN_ON_ONCE(1); return NULL; } /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { return NULL; } /* A valid data block will always have at least an ID. */ if (WARN_ON_ONCE(*data_size < sizeof(db->id))) return NULL; /* Subtract block ID space from size to reflect data size. */ *data_size -= sizeof(db->id); return &db->data[0]; } /* * Attempt to transition the newest descriptor from committed back to reserved * so that the record can be modified by a writer again. This is only possible * if the descriptor is not yet finalized and the provided @caller_id matches. */ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, u32 caller_id, unsigned long *id_out) { unsigned long prev_state_val; enum desc_state d_state; struct prb_desc desc; struct prb_desc *d; unsigned long id; u32 cid; id = atomic_long_read(&desc_ring->head_id); /* * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ d_state = desc_read(desc_ring, id, &desc, NULL, &cid); if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); prev_state_val = DESC_SV(id, desc_committed); /* * Guarantee the reserved state is stored before reading any * record data. A full memory barrier is needed because @state_var * modification is followed by reading. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_reopen_last:A reads from _prb_commit:B, then * prb_reserve_in_last:A reads from _prb_commit:A. 
* * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * MB If desc_reopen_last:A to prb_reserve_in_last:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ return NULL; } *id_out = id; return d; } /** * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer * used by the newest record. * * @e: The entry structure to setup. * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. * * The writer specifies the text size to extend (not the new total size) by * setting the @text_buf_size field of @r. To ensure proper initialization * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using * prb_reserve(). * * Context: Any context. Disables local interrupts on success. * Return: true if text data could be extended, otherwise false. * * On success: * * - @r->text_buf points to the beginning of the entire text buffer. * * - @r->text_buf_size is set to the new total size of the buffer. * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * * - prb_record_text_space() can be used on @e to query the new * actually used space. * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; } /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ info = to_info(desc_ring, id); /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. */ e->rb = rb; e->id = id; /* * desc_reopen_last() checked the caller_id, but there was no * exclusive access at that point. The descriptor may have * changed since then. */ if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", info->text_len); info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } else { if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) goto fail; /* * Increase the buffer size to include the original size. If * the meta data (@text_len) is not sane, use the full data * block size. 
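		 *
		 * Worked example (hypothetical values): if the record
		 * already holds info->text_len == 100 bytes and the caller
		 * asked to append 50 more (r->text_buf_size == 50 on entry),
		 * the new total becomes r->text_buf_size == 150 and
		 * data_realloc() is asked for a block large enough to hold
		 * all 150 bytes.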
*/ if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", info->text_len, data_size); info->text_len = data_size; } r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_realloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } if (r->text_buf_size && !r->text_buf) goto fail; r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: prb_commit(e); /* prb_commit() re-enabled interrupts. */ fail_reopen: /* Make it clear to the caller that the re-reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* * @last_finalized_seq value guarantees that all records up to and including * this sequence number are finalized and can be read. The only exception are * too old records which have already been overwritten. * * It is also guaranteed that @last_finalized_seq only increases. * * Be aware that finalized records following non-finalized records are not * reported because they are not yet available to the reader. For example, * a new record stored via printk() will not be available to a printer if * it follows a record that has not been finalized yet. However, once that * non-finalized record becomes finalized, @last_finalized_seq will be * appropriately updated and the full set of finalized records will be * available to the printer. And since each printk() caller will either * directly print or trigger deferred printing of all available unprinted * records, all printk() messages will get printed. */ static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long ulseq; /* * Guarantee the sequence number is loaded before loading the * associated record in order to guarantee that the record can be * seen by this CPU. This pairs with desc_update_last_finalized:A. */ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq ); /* LMM(desc_last_finalized_seq:A) */ return __ulseq_to_u64seq(rb, ulseq); } static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count); /* * Check if there are records directly following @last_finalized_seq that are * finalized. If so, update @last_finalized_seq to the latest of these * records. It is not allowed to skip over records that are not yet finalized. */ static void desc_update_last_finalized(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; u64 old_seq = desc_last_finalized_seq(rb); unsigned long oldval; unsigned long newval; u64 finalized_seq; u64 try_seq; try_again: finalized_seq = old_seq; try_seq = finalized_seq + 1; /* Try to find later finalized records. */ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { finalized_seq = try_seq; try_seq++; } /* No update needed if no later finalized record was found. */ if (finalized_seq == old_seq) return; oldval = __u64seq_to_ulseq(old_seq); newval = __u64seq_to_ulseq(finalized_seq); /* * Set the sequence number of a later finalized record that has been * seen. * * Guarantee the record data is visible to other CPUs before storing * its sequence number. This pairs with desc_last_finalized_seq:A. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then desc_read:A reads from * _prb_commit:B. 
* * Relies on: * * RELEASE from _prb_commit:B to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to desc_read:A * * Note: _prb_commit:B and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A * CPU (which performs the release) must have previously seen * _prb_commit:B. */ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ old_seq = __ulseq_to_u64seq(rb, oldval); goto try_again; } } /* * Attempt to finalize a specified descriptor. If this fails, the descriptor * is either already final or it will finalize itself when the writer commits. */ static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val = DESC_SV(id, desc_committed); struct prb_desc *d = to_desc(desc_ring, id); if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ desc_update_last_finalized(rb); } } /** * prb_reserve() - Reserve space in the ringbuffer. * * @e: The entry structure to setup. * @rb: The ringbuffer to reserve data in. * @r: The record structure to allocate buffers for. * * This is the public function available to writers to reserve data. * * The writer specifies the text size to reserve by setting the * @text_buf_size field of @r. To ensure proper initialization of @r, * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * * On success, the fields @info and @text_buf of @r will be set by this * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * * Important: @info->text_len needs to be set correctly by the writer in * order for data to be readable and/or extended. Its value * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. Disable * interrupts during the reserve/commit window in order to minimize * the likelihood of this happening. */ local_irq_save(e->irqflags); if (!desc_reserve(rb, &id)) { /* Descriptor reservation failures are tracked. */ atomic_long_inc(&rb->fail); local_irq_restore(e->irqflags); goto fail; } d = to_desc(desc_ring, id); info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ seq = info->seq; memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. */ e->rb = rb; e->id = id; /* * Initialize the sequence number if it has "never been set". * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. 
* * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) info->seq = DESC_INDEX(desc_ring, id); else info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous * descriptors are no longer able to be extended. Finalize the * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ if (info->seq > 0) desc_make_final(rb, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; } r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: /* Make it clear to the caller that the reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* Commit the data (possibly finalizing it) and restore interrupts. */ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * * 1 Guarantee all record data is stored before the descriptor state * is stored as committed. A write memory barrier is sufficient * for this. This pairs with desc_read:B and desc_reopen_last:A. * * 2. Guarantee the descriptor state is stored as committed before * re-checking the head ID in order to possibly finalize this * descriptor. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If prb_commit:A reads from desc_reserve:D, then * desc_make_final:A reads from _prb_commit:B. * * Relies on: * * MB _prb_commit:B to prb_commit:A * matching * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } /* Restore interrupts, the reserve/commit window is finished. */ local_irq_restore(e->irqflags); } /** * prb_commit() - Commit (previously reserved) data to the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit data. * * Note that the data is not yet available to readers until it is finalized. * Finalizing happens automatically when space for the next record is * reserved. * * See prb_final_commit() for a version of this function that finalizes * immediately. * * Context: Any context. Enables local interrupts. */ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; unsigned long head_id; _prb_commit(e, desc_committed); /* * If this descriptor is no longer the head (i.e. a new record has * been allocated), extending the data for this record is no longer * allowed and therefore it must be finalized. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ if (head_id != e->id) desc_make_final(e->rb, e->id); } /** * prb_final_commit() - Commit and finalize (previously reserved) data to * the ringbuffer. * * @e: The entry containing the reserved data information. 
* * This is the public function available to writers to commit+finalize data. * * By finalizing, the data is made immediately available to readers. * * This function should only be used if there are no intentions of extending * this data using prb_reserve_in_last(). * * Context: Any context. Enables local interrupts. */ void prb_final_commit(struct prb_reserved_entry *e) { _prb_commit(e, desc_finalized); desc_update_last_finalized(e->rb); } /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); if (!next) break; line_count++; next++; next_size = text_size - (next - text); } return line_count; } /* * Given @blk_lpos, copy an expected @len of data into the provided buffer. * If @line_count is provided, count the number of lines in the data. * * This function (used by readers) performs strict validation on the data * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static bool copy_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) return true; data = get_data(data_ring, blk_lpos, &data_size); if (!data) return false; /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. * * Note that invalid @len values can occur because the caller loads * the value during an allowed data race. */ if (data_size < (unsigned int)len) return false; /* Caller interested in the line count? */ if (line_count) *line_count = count_lines(data, len); /* Caller interested in the data content? */ if (!buf || !buf_size) return true; data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } /* * This is an extended version of desc_read(). It gets a copy of a specified * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. * * Error return values: * -EINVAL: A finalized record with sequence number @seq does not exist. * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; u64 s; d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record * does not exist. A descriptor in the reserved or committed state * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || s != seq) { return -EINVAL; } /* * A descriptor in the reusable state may no longer have its data * available; report it as existing but with lost data. Or the record * may actually be a record with lost data. 
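 *
 * Example of the latter (hypothetical): a writer whose data_alloc()
 * failed commits a data-less record with blk_lpos.begin and
 * blk_lpos.next both set to FAILED_LPOS. Such a record finalizes with
 * a valid @seq, but there is no text to read, so -ENOENT tells the
 * reader to continue with @seq + 1.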
	 */
	if (d_state == desc_reusable ||
	    (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
		return -ENOENT;
	}

	return 0;
}

/*
 * Copy the ringbuffer data from the record with @seq to the provided
 * @r buffer. On success, 0 is returned.
 *
 * See desc_read_finalized_seq() for error return values.
 */
static int prb_read(struct printk_ringbuffer *rb, u64 seq,
		    struct printk_record *r, unsigned int *line_count)
{
	struct prb_desc_ring *desc_ring = &rb->desc_ring;
	struct printk_info *info = to_info(desc_ring, seq);
	struct prb_desc *rdesc = to_desc(desc_ring, seq);
	atomic_long_t *state_var = &rdesc->state_var;
	struct prb_desc desc;
	unsigned long id;
	int err;

	/* Extract the ID, used to specify the descriptor to read. */
	id = DESC_ID(atomic_long_read(state_var));

	/* Get a local copy of the correct descriptor (if available). */
	err = desc_read_finalized_seq(desc_ring, id, seq, &desc);

	/*
	 * If @r is NULL, the caller is only interested in the availability
	 * of the record.
	 */
	if (err || !r)
		return err;

	/* If requested, copy meta data. */
	if (r->info)
		memcpy(r->info, info, sizeof(*(r->info)));

	/* Copy text data. If it fails, this is a data-less record. */
	if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
		       r->text_buf, r->text_buf_size, line_count)) {
		return -ENOENT;
	}

	/* Ensure the record is still finalized and has the same @seq. */
	return desc_read_finalized_seq(desc_ring, id, seq, &desc);
}

/* Get the sequence number of the tail descriptor. */
u64 prb_first_seq(struct printk_ringbuffer *rb)
{
	struct prb_desc_ring *desc_ring = &rb->desc_ring;
	enum desc_state d_state;
	struct prb_desc desc;
	unsigned long id;
	u64 seq;

	for (;;) {
		id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */

		d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */

		/*
		 * This loop will not be infinite because the tail is
		 * _always_ in the finalized or reusable state.
		 */
		if (d_state == desc_finalized || d_state == desc_reusable)
			break;

		/*
		 * Guarantee the last state load from desc_read() is before
		 * reloading @tail_id in order to see a new tail in the case
		 * that the descriptor has been recycled. This pairs with
		 * desc_reserve:D.
		 *
		 * Memory barrier involvement:
		 *
		 * If prb_first_seq:B reads from desc_reserve:F, then
		 * prb_first_seq:A reads from desc_push_tail:B.
		 *
		 * Relies on:
		 *
		 * MB from desc_push_tail:B to desc_reserve:F
		 *    matching
		 * RMB from prb_first_seq:B to prb_first_seq:A
		 */
		smp_rmb(); /* LMM(prb_first_seq:C) */
	}

	return seq;
}

/**
 * prb_next_reserve_seq() - Get the sequence number after the most recently
 *                          reserved record.
 *
 * @rb: The ringbuffer to get the sequence number from.
 *
 * This is the public function available to readers to see what sequence
 * number will be assigned to the next reserved record.
 *
 * Note that depending on the situation, this value can be equal to or
 * higher than the sequence number returned by prb_next_seq().
 *
 * Context: Any context.
 * Return: The sequence number that will be assigned to the next record
 *         reserved.
 */
u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
{
	struct prb_desc_ring *desc_ring = &rb->desc_ring;
	unsigned long last_finalized_id;
	atomic_long_t *state_var;
	u64 last_finalized_seq;
	unsigned long head_id;
	struct prb_desc desc;
	unsigned long diff;
	struct prb_desc *d;
	int err;

	/*
	 * It may not be possible to read a sequence number for @head_id.
	 * So the ID of @last_finalized_seq is used to calculate what the
	 * sequence number of @head_id will be.
	 */
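	/*
	 * Worked example (hypothetical values): if @last_finalized_seq is 10
	 * and @head_id is 3 IDs ahead of the descriptor ID holding that
	 * record, the record at @head_id has sequence number 13, so 14 is
	 * returned as the next sequence number to be reserved.
	 */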
*/ try_again: last_finalized_seq = desc_last_finalized_seq(rb); /* * @head_id is loaded after @last_finalized_seq to ensure that * it points to the record with @last_finalized_seq or newer. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then * prb_next_reserve_seq:A reads from desc_reserve:D. * * Relies on: * * RELEASE from desc_reserve:D to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A * * Note: desc_reserve:D and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A CPU * (which performs the release) must have previously seen * desc_read:C, which implies desc_reserve:D can be seen. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ d = to_desc(desc_ring, last_finalized_seq); state_var = &d->state_var; /* Extract the ID, used to specify the descriptor to read. */ last_finalized_id = DESC_ID(atomic_long_read(state_var)); /* Ensure @last_finalized_id is correct. */ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); if (err == -EINVAL) { if (last_finalized_seq == 0) { /* * No record has been finalized or even reserved yet. * * The @head_id is initialized such that the first * increment will yield the first record (seq=0). * Handle it separately to avoid a negative @diff * below. */ if (head_id == DESC0_ID(desc_ring->count_bits)) return 0; /* * One or more descriptors are already reserved. Use * the descriptor ID of the first one (@seq=0) for * the @diff below. */ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; } else { /* Record must have been overwritten. Try again. */ goto try_again; } } /* Diff of known descriptor IDs to compute related sequence numbers. */ diff = head_id - last_finalized_id; /* * @head_id points to the most recently reserved record, but this * function returns the sequence number that will be assigned to the * next (not yet reserved) record. Thus +1 is needed. */ return (last_finalized_seq + diff + 1); } /* * Non-blocking read of a record. * * On success @seq is updated to the record that was read and (if provided) * @r and @line_count will contain the read/calculated data. * * On failure @seq is updated to a record that is not yet available to the * reader, but it will be the next record available to the reader. * * Note: When the current CPU is in panic, this function will skip over any * non-existent/non-finalized records in order to allow the panic CPU * to print any and all records that have been finalized. */ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count) { u64 tail_seq; int err; while ((err = prb_read(rb, *seq, r, line_count))) { tail_seq = prb_first_seq(rb); if (*seq < tail_seq) { /* * Behind the tail. Catch up and try again. This * can happen for -ENOENT and -EINVAL cases. */ *seq = tail_seq; } else if (err == -ENOENT) { /* Record exists, but the data was lost. Skip. */ (*seq)++; } else { /* * Non-existent/non-finalized record. Must stop. * * For panic situations it cannot be expected that * non-finalized records will become finalized. But * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this * non-existent/non-finalized record unless non-panic * CPUs are still running and their debugging is * explicitly enabled. 
* * Note that new messages printed on panic CPU are * finalized when we are here. The only exception * might be the last message without trailing newline. * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ if (this_cpu_in_panic() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; } else { return false; } } } return true; } /** * prb_read_valid() - Non-blocking read of a requested record or (if gone) * the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @r: A record data buffer to store the read record to. * * This is the public function available to readers to read a record. * * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. * * Context: Any context. * Return: true if a record was read, otherwise false. * * On success, the reader must check r->info.seq to see which record was * actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r) { return _prb_read_valid(rb, &seq, r, NULL); } /** * prb_read_valid_info() - Non-blocking read of meta data for a requested * record or (if gone) the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @info: A buffer to store the read record meta data to. * @line_count: A buffer to store the number of lines in the record text. * * This is the public function available to readers to read only the * meta data of a record. * * The reader provides the @info, @line_count buffers to be filled in. * Either of the buffer pointers can be set to NULL if the reader is not * interested in that data. * * Context: Any context. * Return: true if a record's meta data was read, otherwise false. * * On success, the reader must check info->seq to see which record meta data * was actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count) { struct printk_record r; prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } /** * prb_first_valid_seq() - Get the sequence number of the oldest available * record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the * first/oldest valid sequence number is. * * This provides readers a starting point to begin iterating the ringbuffer. * * Context: Any context. * Return: The sequence number of the first/oldest record or, if the * ringbuffer is empty, 0 is returned. */ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) { u64 seq = 0; if (!_prb_read_valid(rb, &seq, NULL, NULL)) return 0; return seq; } /** * prb_next_seq() - Get the sequence number after the last available record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the next * newest sequence number available to readers will be. 
 *
 * This provides readers a sequence number to jump to if all currently
 * available records should be skipped. It is guaranteed that all records
 * previous to the returned value have been finalized and are (or were)
 * available to the reader.
 *
 * Context: Any context.
 * Return: The sequence number of the next newest (not yet available) record
 *         for readers.
 */
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
	u64 seq;

	seq = desc_last_finalized_seq(rb);

	/*
	 * Begin searching after the last finalized record.
	 *
	 * On 0, the search must begin at 0 because, due to hack#2 of the
	 * bootstrapping phase, it is not known whether a record at index 0
	 * exists.
	 */
	if (seq != 0)
		seq++;

	/*
	 * The information about the last finalized @seq might be inaccurate.
	 * Search forward to find the current one.
	 */
	while (_prb_read_valid(rb, &seq, NULL, NULL))
		seq++;

	return seq;
}

/**
 * prb_init() - Initialize a ringbuffer to use provided external buffers.
 *
 * @rb:       The ringbuffer to initialize.
 * @text_buf: The data buffer for text data.
 * @textbits: The size of @text_buf as a power-of-2 value.
 * @descs:    The descriptor buffer for ringbuffer records.
 * @descbits: The count of @descs items as a power-of-2 value.
 * @infos:    The printk_info buffer for ringbuffer records.
 *
 * This is the public function available to writers to set up a ringbuffer
 * during runtime using provided buffers.
 *
 * This must match the initialization of DEFINE_PRINTKRB().
 *
 * Context: Any context.
 */
void prb_init(struct printk_ringbuffer *rb,
	      char *text_buf, unsigned int textbits,
	      struct prb_desc *descs, unsigned int descbits,
	      struct printk_info *infos)
{
	memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
	memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));

	rb->desc_ring.count_bits = descbits;
	rb->desc_ring.descs = descs;
	rb->desc_ring.infos = infos;
	atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
	atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
	atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);

	rb->text_data_ring.size_bits = textbits;
	rb->text_data_ring.data = text_buf;
	atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
	atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));

	atomic_long_set(&rb->fail, 0);

	atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
	descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
	descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;

	infos[0].seq = -(u64)_DESCS_COUNT(descbits);
	infos[_DESCS_COUNT(descbits) - 1].seq = 0;
}

/**
 * prb_record_text_space() - Query the full actual used ringbuffer space for
 *                           the text data of a reserved entry.
 *
 * @e: The successfully reserved entry to query.
 *
 * This is the public function available to writers to see how much actual
 * space is used in the ringbuffer to store the text data of the specified
 * entry.
 *
 * This function is only valid if @e has been successfully reserved using
 * prb_reserve().
 *
 * Context: Any context.
 * Return: The size in bytes used by the text data of the associated record.
 */
unsigned int prb_record_text_space(struct prb_reserved_entry *e)
{
	return e->text_space;
}
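/*
 * A minimal usage sketch (illustrative only; the ringbuffer name and sizes
 * are hypothetical). A writer reserves, fills and commits one record, and
 * a reader then walks all finalized records:
 *
 *	DEFINE_PRINTKRB(test_rb, 15, 5);
 *
 *	// writer context
 *	struct prb_reserved_entry e;
 *	struct printk_record r;
 *
 *	prb_rec_init_wr(&r, 32);
 *	if (prb_reserve(&e, &test_rb, &r)) {
 *		snprintf(r.text_buf, r.text_buf_size, "hello");
 *		r.info->text_len = strlen("hello");
 *		prb_commit(&e);
 *	}
 *
 *	// reader context
 *	struct printk_info info;
 *	struct printk_record rec;
 *	char text[64];
 *	u64 seq;
 *
 *	prb_rec_init_rd(&rec, &info, &text[0], sizeof(text));
 *	prb_for_each_record(0, &test_rb, seq, &rec)
 *		pr_info("%llu: %s\n", info.seq, &text[0]);
 */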
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/memfd.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)	(0)
#endif

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

/* Update vma->vm_page_prot to reflect vma->vm_flags.
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        unsigned long vm_flags = vma->vm_flags;
        pgprot_t vm_page_prot;

        vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
        if (vma_wants_writenotify(vma, vm_page_prot)) {
                vm_flags &= ~VM_SHARED;
                vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
        }
        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}

/*
 * check_brk_limits() - Use platform specific check of range & verify mlock
 * limits.
 * @addr: The address to check
 * @len: The size of increase.
 *
 * Return: 0 on success.
 */
static int check_brk_limits(unsigned long addr, unsigned long len)
{
        unsigned long mapped_addr;

        mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
        if (IS_ERR_VALUE(mapped_addr))
                return mapped_addr;

        return mlock_future_ok(current->mm, current->mm->def_flags, len)
                ? 0 : -EAGAIN;
}

SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *brkvma, *next = NULL;
        unsigned long min_brk;
        bool populate = false;
        LIST_HEAD(uf);
        struct vma_iterator vmi;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
                min_brk = mm->start_brk;
        else
                min_brk = mm->end_data;
#else
        min_brk = mm->start_brk;
#endif
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later after the test
         * of oldbrk with newbrk then it can escape the test and let the data
         * segment grow beyond its set limit in the case where the limit is
         * not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /* Always allow shrinking brk. */
        if (brk <= mm->brk) {
                /* Search one past newbrk */
                vma_iter_init(&vmi, mm, newbrk);
                brkvma = vma_find(&vmi, oldbrk);
                if (!brkvma || brkvma->vm_start >= oldbrk)
                        goto out; /* mapping intersects with an existing non-brk vma. */
                /*
                 * mm->brk must be protected by write mmap_lock.
                 * do_vmi_align_munmap() will drop the lock on success, so
                 * update it before calling do_vma_munmap().
                 */
                mm->brk = brk;
                if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf,
                                        /* unlock = */ true))
                        goto out;

                goto success_unlocked;
        }

        if (check_brk_limits(oldbrk, newbrk - oldbrk))
                goto out;

        /*
         * Only check if the next VMA is within the stack_guard_gap of the
         * expansion area
         */
        vma_iter_init(&vmi, mm, oldbrk);
        next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        brkvma = vma_prev_limit(&vmi, mm->start_brk);
        /* Ok, looks good - let it rip.
*/ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /** * do_mmap() - Perform a userland memory mapping into the current process * address space of length @len with protection bits @prot, mmap flags @flags * (from which VMA flags will be inferred), and any additional VMA flags to * apply @vm_flags. If this is a file-backed mapping then the file is specified * in @file and page offset into the file via @pgoff. * * This function does not perform security checks on the file and assumes, if * @uf is non-NULL, the caller has provided a list head to track unmap events * for userfaultfd @uf. * * It also simply indicates whether memory population is required by setting * @populate, which must be non-NULL, expecting the caller to actually perform * this task itself if appropriate. * * This function will invoke architecture-specific (and if provided and * relevant, file system-specific) logic to determine the most appropriate * unmapped area in which to place the mapping if not MAP_FIXED. * * Callers which require userland mmap() behaviour should invoke vm_mmap(), * which is also exported for module use. * * Those which require this behaviour less security checks, userfaultfd and * populate behaviour, and who handle the mmap write lock themselves, should * call this function. * * Note that the returned address may reside within a merged VMA if an * appropriate merge were to take place, so it doesn't necessarily specify the * start of a VMA, rather only the start of a valid mapped range of length * @len bytes, rounded down to the nearest page size. * * The caller must write-lock current->mm->mmap_lock. * * @file: An optional struct file pointer describing the file which is to be * mapped, if a file-backed mapping. 
 * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
 * address at which to perform this mapping. See mmap(2) for details. Must be
 * page-aligned.
 * @len: The length of the mapping. Will be page-aligned and must be at least 1
 * page in size.
 * @prot: Protection bits describing access required to the mapping. See
 * mmap(2) for details.
 * @flags: Flags specifying how the mapping should be performed, see mmap(2)
 * for details.
 * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
 * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
 * @populate: A pointer to a value which will be set to 0 if no population of
 * the range is required, or the number of bytes to populate if it is. Must be
 * non-NULL. See mmap(2) for details as to under what circumstances population
 * of the range occurs.
 * @uf: An optional pointer to a list head to track userfaultfd unmap events
 * should unmapping events arise. If provided, it is up to the caller to manage
 * this.
 *
 * Returns: Either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, vm_flags_t vm_flags,
                        unsigned long pgoff, unsigned long *populate,
                        struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        int pkey = 0;

        *populate = 0;

        mmap_assert_write_locked(mm);

        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /*
         * addr is returned from get_unmapped_area.
         * There are two cases:
         * 1> MAP_FIXED == false
         *      unallocated memory, no need to check sealing.
         * 2> MAP_FIXED == true
         *      sealing is checked inside mmap_region when
         *      do_vmi_munmap is called.
         */

        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. We assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        /* Obtain the address to map to. We verify (or select) it and ensure
         * that it represents a valid section of the address space.
*/ addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK; if (file->f_op->fop_flags & FOP_MMAP_SYNC) flags_mask |= MAP_SYNC; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!file->f_op->mmap) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } /* * Check to see if we are violating any seals and update VMA * flags if necessary to avoid future seal violations. */ err = memfd_check_seals_mmap(file, &vm_flags); if (err) return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_DROPPABLE: if (VM_DROPPABLE == VM_NONE) return -ENOTSUPP; /* * A locked or stack area makes no sense to be droppable. * * Also, since droppable pages can just go away at any time * it makes no sense to copy them on fork or dump them. * * And don't attempt to combine with hugetlb for now. */ if (flags & (MAP_LOCKED | MAP_HUGETLB)) return -EINVAL; if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) return -EINVAL; vm_flags |= VM_DROPPABLE; /* * If the pages can be dropped, then it doesn't make * sense to reserve them. */ vm_flags |= VM_NORESERVE; /* * Likewise, they're volatile enough that they * shouldn't survive forks or coredumps. */ vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; fallthrough; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. 
 */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (!(flags & MAP_ANONYMOUS)) {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;
                if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        retval = -EINVAL;
                        goto out_fput;
                }
        } else if (flags & MAP_HUGETLB) {
                struct hstate *hs;

                hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (!hs)
                        return -EINVAL;

                len = ALIGN(len, huge_page_size(hs));
                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
                 */
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
                                HUGETLB_ANONHUGE_INODE,
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (IS_ERR(file))
                        return PTR_ERR(file);
        }

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
        if (file)
                fput(file);
        return retval;
}

SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
struct mmap_arg_struct {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
        unsigned long flags;
        unsigned long fd;
        unsigned long offset;
};

SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct a;

        if (copy_from_user(&a, arg, sizeof(a)))
                return -EFAULT;
        if (offset_in_page(a.offset))
                return -EINVAL;

        return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
                               a.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * Determine if the allocation needs to ensure that there is no
 * existing mapping within its guard gaps, for use as start_gap.
 */
static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
{
        if (vm_flags & VM_SHADOW_STACK)
                return PAGE_SIZE;

        return 0;
}

/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long addr;

        if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
                addr = unmapped_area_topdown(info);
        else
                addr = unmapped_area(info);

        trace_vm_unmapped_area(addr, info);
        return addr;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *	if (ret & ~PAGE_MASK)
 *		error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
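 *
 * Editorial illustration (not part of the original comment): given that
 * convention, a caller of this family of helpers might check the result
 * like this, mirroring the offset_in_page() test used further below in
 * this file:
 *
 *	addr = generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
 *	if (offset_in_page(addr))	// low bits set: addr is an errno
 *		return addr;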
*/ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info = {}; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); if (filp && is_file_hugepages(filp)) info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. 
*/ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); } #endif unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); } unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. */ get_area = shmem_get_unmapped_area; } /* Always treat pgoff as zero for anonymous memory. */ if (!file) pgoff = 0; if (get_area) { addr = get_area(file, addr, len, pgoff, flags); } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file && !addr /* no hint */ && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); } else { addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, flags, vm_flags); } if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { if (test_bit(MMF_TOPDOWN, &mm->flags)) return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0); return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); } EXPORT_SYMBOL(mm_get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. 
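 *
 * Editorial usage sketch (not part of the original kerneldoc); the caller
 * must hold the mmap lock, and must compare vm_start to distinguish "VMA
 * containing @addr" from "next VMA above @addr":
 *
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma && vma->vm_start <= addr)
 *		handle_containing_vma(vma);	// hypothetical helper
 *	mmap_read_unlock(mm);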
*/ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, addr); vma = vma_iter_load(&vmi); *pprev = vma_prev(&vmi); if (!vma) vma = vma_next(&vmi); return vma; } /* enforced gap between the expanding stack and other mappings. */ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; static int __init cmdline_parse_stack_guard_gap(char *p) { unsigned long val; char *endptr; val = simple_strtoul(p, &endptr, 10); if (!*endptr) stack_guard_gap = val << PAGE_SHIFT; return 1; } __setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); #ifdef CONFIG_STACK_GROWSUP int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_upwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma, *prev; addr &= PAGE_MASK; vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; if (!prev) return NULL; if (expand_stack_locked(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { return expand_downwards(vma, address); } struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; unsigned long start; addr &= PAGE_MASK; vma = find_vma(mm, addr); if (!vma) return NULL; if (vma->vm_start <= addr) return vma; start = vma->vm_start; if (expand_stack_locked(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif #if defined(CONFIG_STACK_GROWSUP) #define vma_expand_up(vma,addr) expand_upwards(vma, addr) #define vma_expand_down(vma, addr) (-EFAULT) #else #define vma_expand_up(vma,addr) (-EFAULT) #define vma_expand_down(vma, addr) expand_downwards(vma, addr) #endif /* * expand_stack(): legacy interface for page faulting. Don't use unless * you have to. * * This is called with the mm locked for reading, drops the lock, takes * the lock for writing, tries to look up a vma again, expands it if * necessary, and downgrades the lock to reading again. * * If no vma is found or it can't be expanded, it returns NULL and has * dropped the lock. 
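 *
 * Editorial sketch of the resulting calling convention (not from the
 * original file):
 *
 *	mmap_read_lock(mm);
 *	vma = expand_stack(mm, addr);	// may drop and retake the lock
 *	if (!vma)
 *		return -EFAULT;		// lock already dropped on failure
 *	// ...use vma under the re-acquired read lock...
 *	mmap_read_unlock(mm);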
 */
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma, *prev;

        mmap_read_unlock(mm);
        if (mmap_write_lock_killable(mm))
                return NULL;

        vma = find_vma_prev(mm, addr, &prev);
        if (vma && vma->vm_start <= addr)
                goto success;

        if (prev && !vma_expand_up(prev, addr)) {
                vma = prev;
                goto success;
        }

        if (vma && !vma_expand_down(vma, addr))
                goto success;

        mmap_write_unlock(mm);
        return NULL;

success:
        mmap_write_downgrade(mm);
        return vma;
}

/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length to be munmapped.
 * @uf: The userfaultfd list_head
 *
 * Return: 0 on success, error otherwise.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        VMA_ITERATOR(vmi, mm, start);

        return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}

int vm_munmap(unsigned long start, size_t len)
{
        return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);

SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        addr = untagged_addr(addr);
        return __vm_munmap(addr, len, true);
}

/*
 * Emulation of deprecated remap_file_pages() syscall.
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;
        vm_flags_t vm_flags;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        if (prot)
                return ret;
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_read_lock_killable(mm))
                return -EINTR;

        /*
         * Look up VMA under read lock first so we can perform the security
         * check without holding locks (which can be problematic). We reacquire
         * a write lock later and check nothing changed underneath us.
         */
        vma = vma_lookup(mm, start);

        if (!vma || !(vma->vm_flags & VM_SHARED)) {
                mmap_read_unlock(mm);
                return -EINVAL;
        }

        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Save vm_flags used to calculate prot and flags, and recheck later. */
        vm_flags = vma->vm_flags;
        file = get_file(vma->vm_file);

        mmap_read_unlock(mm);

        /* Call outside mmap_lock to be consistent with other callers. */
        ret = security_mmap_file(file, prot, flags);
        if (ret) {
                fput(file);
                return ret;
        }

        ret = -EINVAL;

        /* OK security check passed, take write lock + let it rip. */
        if (mmap_write_lock_killable(mm)) {
                fput(file);
                return -EINTR;
        }

        vma = vma_lookup(mm, start);

        if (!vma)
                goto out;

        /* Make sure things didn't change under us. */
        if (vma->vm_flags != vm_flags)
                goto out;
        if (vma->vm_file != file)
                goto out;

        if (start + size > vma->vm_end) {
                VMA_ITERATOR(vmi, mm, vma->vm_end);
                struct vm_area_struct *next, *prev = vma;

                for_each_vma_range(vmi, next, start + size) {
                        /* hole between vmas ?
 */
                        if (next->vm_start != prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        if (start + size <= next->vm_end)
                                break;

                        prev = next;
                }

                if (!next)
                        goto out;
        }

        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, 0, pgoff, &populate, NULL);
out:
        mmap_write_unlock(mm);
        fput(file);
        if (populate)
                mm_populate(ret, populate);
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        unsigned long len;
        int ret;
        bool populate;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, addr);

        len = PAGE_ALIGN(request);
        if (len < request)
                return -ENOMEM;
        if (!len)
                return 0;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = check_brk_limits(addr, len);
        if (ret)
                goto limits_failed;

        ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
        if (ret)
                goto munmap_failed;

        vma = vma_prev(&vmi);
        ret = do_brk_flags(&vmi, vma, addr, len, flags);
        populate = ((mm->def_flags & VM_LOCKED) != 0);
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate && !ret)
                mm_populate(addr, len);
        return ret;

munmap_failed:
limits_failed:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(vm_brk_flags);

/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        VMA_ITERATOR(vmi, mm, 0);
        int count = 0;

        /* mm's last user has gone, and it's about to be pulled down */
        mmu_notifier_release(mm);

        mmap_read_lock(mm);
        arch_exit_mmap(mm);

        vma = vma_next(&vmi);
        if (!vma || unlikely(xa_is_zero(vma))) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                mmap_write_lock(mm);
                goto destroy;
        }

        flush_cache_mm(mm);
        tlb_gather_mmu_fullmm(&tlb, mm);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
        mmap_read_unlock(mm);

        /*
         * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
         * because the memory has been already freed.
         */
        set_bit(MMF_OOM_SKIP, &mm->flags);
        mmap_write_lock(mm);
        mt_clear_in_rcu(&mm->mm_mt);
        vma_iter_set(&vmi, vma->vm_end);
        free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
                      USER_PGTABLES_CEILING, true);
        tlb_finish_mmu(&tlb);

        /*
         * Walk the list again, actually closing and freeing it, with preemption
         * enabled, without holding any MM locks besides the unreachable
         * mmap_write_lock.
         */
        vma_iter_set(&vmi, vma->vm_end);
        do {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                vma_mark_detached(vma);
                remove_vma(vma);
                count++;
                cond_resched();
                vma = vma_next(&vmi);
        } while (vma && likely(!xa_is_zero(vma)));

        BUG_ON(count != mm->map_count);

        trace_exit_mmap(mm);
destroy:
        __mt_destroy(&mm->mm_mt);
        mmap_write_unlock(mm);
        vm_unacct_memory(nr_accounted);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree. If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
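 *
 * Editorial sketch of typical use (not from the original file; see
 * __install_special_mapping() further below for an in-file caller):
 *
 *	vma = vm_area_alloc(mm);
 *	// ...set the vma's range, flags and ops...
 *	if (insert_vm_struct(mm, vma))
 *		goto free_vma;	// -ENOMEM: range busy or accounting failed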
*/ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma->vm_flags & VM_ACCOUNT) vm_unacct_memory(charged); return -ENOMEM; } return 0; } /* * Return true if the calling process may expand its vm space by the passed * number of pages */ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) { if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) return false; if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { /* Workaround for Valgrind */ if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA), ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); if (!ignore_rlimit_data) return false; } return true; } void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; else if (is_stack_mapping(flags)) mm->stack_vm += npages; else if (is_data_mapping(flags)) mm->data_vm += npages; } static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Close hook, called for unmap() and on the old vma for mremap(). * * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { const struct vm_special_mapping *sm = vma->vm_private_data; if (sm->close) sm->close(sm, vma); } static const char *special_mapping_name(struct vm_area_struct *vma) { return ((struct vm_special_mapping *)vma->vm_private_data)->name; } static int special_mapping_mremap(struct vm_area_struct *new_vma) { struct vm_special_mapping *sm = new_vma->vm_private_data; if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; if (sm->mremap) return sm->mremap(sm, new_vma); return 0; } static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) { /* * Forbid splitting special mappings - kernel has expectations over * the number of pages in mapping. Together with VM_DONTEXPAND * the size of vma should stay the same over the special mapping's * lifetime. 
*/ return -EINVAL; } static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, .may_split = special_mapping_split, }; static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { struct page *page = *pages; get_page(page); vmf->page = page; return 0; } return VM_FAULT_SIGBUS; } static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); vma_set_range(vma, addr, addr + len, 0); vm_flags_init(vma, (vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) goto out; vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); return vma; out: vm_area_free(vma); return ERR_PTR(ret); } bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm) { return vma->vm_private_data == sm && vma->vm_ops == &special_mapping_vmops; } /* * Called with mm->mmap_lock held for writing. * Insert a new vma covering the given region, with the given flags. * Its pages are supplied by the given array of struct page *. * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. * The region past the last page supplied will always produce SIGBUS. * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. 
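 *
 * Editorial illustration of a hypothetical caller (all names here are
 * invented, not from this file); compare the vDSO mappings set up by the
 * architectures:
 *
 *	static struct page *demo_pages[] = { &demo_page, NULL };  // NULL-terminated
 *	static const struct vm_special_mapping demo_spec = {
 *		.name	= "[demo]",
 *		.pages	= demo_pages,
 *	};
 *
 *	vma = _install_special_mapping(mm, addr, PAGE_SIZE,
 *				       VM_READ | VM_MAYREAD, &demo_spec);
 *	if (IS_ERR(vma))
 *		return PTR_ERR(vma);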
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
                                        &special_mapping_vmops);
}

#ifdef CONFIG_SYSCTL
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
        defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif

static const struct ctl_table mmap_table[] = {
        {
                .procname       = "max_map_count",
                .data           = &sysctl_max_map_count,
                .maxlen         = sizeof(sysctl_max_map_count),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
        },
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
        defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
        {
                .procname       = "legacy_va_layout",
                .data           = &sysctl_legacy_va_layout,
                .maxlen         = sizeof(sysctl_legacy_va_layout),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
        },
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
        {
                .procname       = "mmap_rnd_bits",
                .data           = &mmap_rnd_bits,
                .maxlen         = sizeof(mmap_rnd_bits),
                .mode           = 0600,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (void *)&mmap_rnd_bits_min,
                .extra2         = (void *)&mmap_rnd_bits_max,
        },
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        {
                .procname       = "mmap_rnd_compat_bits",
                .data           = &mmap_rnd_compat_bits,
                .maxlen         = sizeof(mmap_rnd_compat_bits),
                .mode           = 0600,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = (void *)&mmap_rnd_compat_bits_min,
                .extra2         = (void *)&mmap_rnd_compat_bits_max,
        },
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * initialise the percpu counter for VM
 */
void __init mmap_init(void)
{
        int ret;

        ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
        VM_BUG_ON(ret);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", mmap_table);
#endif
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * This is intended to prevent a user from starting a single memory hogging
 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(3% of free memory, 128MB).
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 */
static int init_user_reserve(void)
{
        unsigned long free_kbytes;

        free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        sysctl_user_reserve_kbytes = min(free_kbytes / 32, SZ_128K);
        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int init_admin_reserve(void)
{
        unsigned long free_kbytes;

        free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

        sysctl_admin_reserve_kbytes = min(free_kbytes / 32, SZ_8K);
        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                                unsigned long action, void *data)
{
        unsigned long tmp, free_kbytes;

        switch (action) {
        case MEM_ONLINE:
                /* Default max is 128MB. Leave alone if modified by operator. */
                tmp = sysctl_user_reserve_kbytes;
                if (tmp > 0 && tmp < SZ_128K)
                        init_user_reserve();

                /* Default max is 8MB. Leave alone if modified by operator. */
                tmp = sysctl_admin_reserve_kbytes;
                if (tmp > 0 && tmp < SZ_8K)
                        init_admin_reserve();

                break;
        case MEM_OFFLINE:
                free_kbytes = K(global_zone_page_state(NR_FREE_PAGES));

                if (sysctl_user_reserve_kbytes > free_kbytes) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kbytes) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }

                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static int __meminit init_reserve_notifier(void)
{
        if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI))
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);

/*
 * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
 * this VMA and its relocated range, which will now reside at [vma->vm_start -
 * shift, vma->vm_end - shift).
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable temporary stack relocation.
 */
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
{
        /*
         * The process proceeds as follows:
         *
         * 1) Use shift to calculate the new vma endpoints.
         * 2) Extend vma to cover both the old and new ranges. This ensures the
         *    arguments passed to subsequent functions are consistent.
         * 3) Move vma's page tables to the new range.
         * 4) Free up any cleared pgd range.
         * 5) Shrink the vma to cover only the new range.
         */

        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        VMA_ITERATOR(vmi, mm, new_start);
        VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
        struct vm_area_struct *next;
        struct mmu_gather tlb;
        PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);

        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != vma_next(&vmi))
                return -EFAULT;

        vma_iter_prev_range(&vmi);

        /*
         * cover the whole range: [new_start, old_end)
         */
        vmg.middle = vma;
        if (vma_expand(&vmg))
                return -ENOMEM;

        /*
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        pmc.for_stack = true;
        if (length != move_page_tables(&pmc))
                return -ENOMEM;

        tlb_gather_mmu(&tlb, mm);
        next = vma_next(&vmi);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * otherwise, clean from old_start; this is done to not touch
                 * the address space in [new_end, old_start); some architectures
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others it's just a little faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end, next ?
next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); vma_prev(&vmi); /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } #ifdef CONFIG_MMU /* * Obtain a read lock on mm->mmap_lock, if the specified address is below the * start of the VMA, the intent is to perform a write, and it is a * downward-growing stack, then attempt to expand the stack to contain it. * * This function is intended only for obtaining an argument page from an ELF * image, and is almost certainly NOT what you want to use for any other * purpose. * * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the * VMA referenced must not be linked in any user-visible tree, i.e. it must be a * new VMA being mapped. * * The function assumes that addr is either contained within the VMA or below * it, and makes no attempt to validate this value beyond that. * * Returns true if the read lock was obtained and a stack was perhaps expanded, * false if the stack expansion failed. * * On stack expansion the function temporarily acquires an mmap write lock * before downgrading it. */ bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *new_vma, unsigned long addr, bool write) { if (!write || addr >= new_vma->vm_start) { mmap_read_lock(mm); return true; } if (!(new_vma->vm_flags & VM_GROWSDOWN)) return false; mmap_write_lock(mm); if (expand_downwards(new_vma, addr)) { mmap_write_unlock(mm); return false; } mmap_write_downgrade(mm); return true; } #else bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write) { return false; } #endif
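As an editorial aside (not part of mm/mmap.c): the MAP_FIXED_NOREPLACE path in
do_mmap() above returns -EEXIST when the requested range intersects an existing
VMA, rather than silently replacing it as MAP_FIXED does. A minimal userspace
sketch of that behaviour:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        /* Establish an anonymous mapping at a kernel-chosen address. */
        void *base = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED)
                return 1;

        /* Request the same range again, refusing to clobber it. */
        void *again = mmap(base, 4096, PROT_READ,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
                           -1, 0);
        if (again == MAP_FAILED && errno == EEXIST)
                printf("kernel refused to clobber %p, as expected\n", base);
        else if (again != MAP_FAILED)
                munmap(again, 4096);

        munmap(base, 4096);
        return 0;
}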
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 * http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/processor.h>
#include <linux/context_tracking_irq.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

#define RCU_SEQ_CTR_SHIFT	2
#define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)

/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void synchronize_rcu(void);

struct rcu_gp_oldstate;
unsigned long get_completed_synchronize_rcu(void);
void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);

// Maximum number of unsigned long values corresponding to
// not-yet-completed RCU grace periods.
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_rcu - Are two old-state values identical?
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * The two old-state values must have been obtained from either
 * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or
 * get_completed_synchronize_rcu(). Returns @true if the two values are
 * identical and @false otherwise. This allows structures whose lifetimes
 * are tracked by old-state values to push these values to a list header,
 * allowing those structures to be slightly smaller.
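 *
 * Editorial sketch (not part of the original kerneldoc); obj and its
 * rcu_cookie field are invented for illustration:
 *
 *	unsigned long cookie = start_poll_synchronize_rcu();
 *
 *	if (same_state_synchronize_rcu(cookie, obj->rcu_cookie))
 *		// both objects are waiting on the same grace period,
 *		// so a single stored cookie can stand in for both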
*/ static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); preempt_enable(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_LAZY void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); #else static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static __always_inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); #define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #define RCU_NOCB_LOCKDEP_WARN(c, s) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU // Bits for ->trc_reader_special.b.need_qs field. #define TRC_NEED_QS 0x1 // Task needs a quiescent state. #define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. 
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); void rcu_tasks_trace_qs_blkd(struct task_struct *t); # define rcu_tasks_trace_qs(t) \ do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs(t); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * * As an accident of implementation, an RCU Tasks Trace grace period also * acts as an RCU grace period. However, this could change at any time. * Code relying on this accident must call this function to verify that * this accident is still happening. * * You have been warned! */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /** * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states * @old_ts: jiffies at start of processing. * * This helper is for long-running softirq handlers, such as NAPI threads in * networking. The caller should initialize the variable passed in as @old_ts * at the beginning of the softirq handler. When invoked frequently, this macro * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will * provide both RCU and RCU-Tasks quiescent states. Note that this macro * modifies its old_ts argument. * * Because regions of code that have disabled softirq act as RCU read-side * critical sections, this macro should be invoked with softirq (and * preemption) enabled. * * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would * have more chance to invoke schedule() calls and provide necessary quiescent * states. As a contrast, calling cond_resched() only won't achieve the same * effect because cond_resched() does not provide RCU-Tasks quiescent states. 
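 *
 * A hedged usage sketch (poll_once() is a placeholder for one pass of
 * the long-running softirq work loop, not a real API)::
 *
 *	unsigned long old_ts = jiffies;
 *
 *	while (poll_once())
 *		rcu_softirq_qs_periodic(old_ts);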
*/ #define rcu_softirq_qs_periodic(old_ts) \ do { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ time_after(jiffies, (old_ts) + HZ / 10)) { \ preempt_disable(); \ rcu_softirq_qs(); \ preempt_enable(); \ (old_ts) = jiffies; \ } \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_try_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_try_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } static inline int debug_lockdep_rcu_enabled(void) { return 0; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. 
See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data..unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else // #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { } #endif // #else // #ifndef CONFIG_PREEMPT_RCU #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) // See RCU_LOCKDEP_WARN() for an explanation of the double call to // debug_lockdep_rcu_enabled(). static inline bool lockdep_assert_rcu_helper(bool c) { return debug_lockdep_rcu_enabled() && (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && debug_lockdep_rcu_enabled(); } /** * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() * * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. */ #define lockdep_assert_in_rcu_read_lock() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) /** * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() * * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. * Note that local_bh_disable() and friends do not suffice here, instead an * actual rcu_read_lock_bh() is required. */ #define lockdep_assert_in_rcu_read_lock_bh() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) /** * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() * * Splats if lockdep is enabled and there is no rcu_read_lock_sched() * in effect. Note that preempt_disable() and friends do not suffice here, * instead an actual rcu_read_lock_sched() is required. */ #define lockdep_assert_in_rcu_read_lock_sched() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) /** * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader * * Splats if lockdep is enabled and there is no RCU reader of any * type in effect. Note that regions of code protected by things like * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify * as RCU readers. * * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY * kernels that are not also built with PREEMPT_COUNT. But if you have * lockdep enabled, you might as well also enable PREEMPT_COUNT. 
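 *
 * A hedged usage sketch (gp is a hypothetical __rcu pointer and
 * do_something() a placeholder)::
 *
 *	rcu_read_lock();
 *	lockdep_assert_in_rcu_reader();
 *	do_something(rcu_dereference(gp));
 *	rcu_read_unlock();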
*/ #define lockdep_assert_in_rcu_reader() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ !lock_is_held(&rcu_bh_lock_map) && \ !lock_is_held(&rcu_sched_lock_map) && \ preemptible())) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #define lockdep_assert_in_rcu_read_lock() do { } while (0) #define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) #define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) #define lockdep_assert_in_rcu_reader() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __unrcu_pointer(p, local) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(local)); \ }) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) #define __rcu_access_pointer(p, local, space) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) local = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). 
RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * Within an RCU read-side critical section, there is little reason to * use rcu_access_pointer(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced * by later inattentive changes. In other words, assigning the * rcu_access_pointer() return value to a local variable results in an * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. 
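 *
 * A minimal sketch, testing the return value directly as recommended
 * above (gp is a hypothetical __rcu pointer)::
 *
 *	if (!rcu_access_pointer(gp))
 *		return -ENOENT;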
*/ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). However, * please note that starting in v5.0 kernels, vanilla RCU grace periods * wait for local_bh_disable() regions of code in addition to regions of * code demarked by rcu_read_lock() and rcu_read_unlock(). This means * that synchronize_rcu(), call_rcu, and friends all take not only * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). * However, please note that starting in v5.0 kernels, vanilla RCU grace * periods wait for preempt_disable() regions of code in addition to * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). * This means that synchronize_rcu(), call_rcu, and friends all take not * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). 
*/ #define rcu_dereference_raw_check(p) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Both synchronize_rcu() and call_rcu() also wait for regions of code * with preemption disabled, including regions of code with interrupts or * softirqs disabled. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. 
This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. * This deadlock immunity also extends to the scheduler's runqueue * and priority-inheritance spinlocks, courtesy of the quiescent-state * deferral that is carried out when rcu_read_unlock() is invoked with * interrupts disabled. * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ __release(RCU); __rcu_read_unlock(); } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent to rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as an RCU * read-side critical section. However, please note that this equivalence * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. 
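 *
 * A hedged usage sketch (gp is a hypothetical __rcu pointer and
 * handle() a placeholder)::
 *
 *	rcu_read_lock_bh();
 *	p = rcu_dereference_bh(gp);
 *	if (p)
 *		handle(p);
 *	rcu_read_unlock_bh();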
*/ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent to rcu_read_lock(), but also disables preemption. * Read-side critical sections can also be introduced by anything else that * disables preemption, including local_irq_disable() and friends. However, * please note that the equivalence to rcu_read_lock() applies only to * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! 
* * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. In order to have a universal * callback function handling different offsets of rcu_head, the callback needs * to determine the starting address of the freed object, which can be a large * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * The object to be freed can be allocated either by kmalloc() or * kmem_cache_alloc(). * * Note that the allowable offset might decrease in the future. * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) #define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** * kfree_rcu_mightsleep() - kfree an object after a grace period. * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) /* * In mm/slab_common.c, no suitable header to include here. */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); /* * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the * comment of kfree_rcu() for details. 
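 *
 * A hedged sketch of a structure that passes the offset check, with its
 * rcu_head comfortably inside the first 4096 bytes (struct foo is
 * hypothetical)::
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rh;
 *	};
 *
 *	kfree_rcu(p, rh);
 *
 * where p is a struct foo pointer returned by kmalloc().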
*/ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) { \ BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; DEFINE_LOCK_GUARD_0(rcu, do { rcu_read_lock(); /* * sparse doesn't call the cleanup function, * so just release immediately and don't track * the context. We don't need to anyway, since * the whole point of the guard is to not need * the explicit unlock. */ __release(RCU); } while (0), rcu_read_unlock()) #endif /* __LINUX_RCUPDATE_H */
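/*
 * Hedged, self-contained sketch of the classic RCU publish/read/reclaim
 * pattern built from the primitives declared above. All names here
 * (struct demo, demo_gp, demo_lock, demo_update(), demo_read()) are
 * hypothetical illustrations, not kernel APIs.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo {
	int value;
};

static struct demo __rcu *demo_gp;
static DEFINE_SPINLOCK(demo_lock);

/* Updater: publish a new version, then reclaim the old one. */
static void demo_update(int v)
{
	struct demo *newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	struct demo *oldp;

	if (!newp)
		return;
	newp->value = v;

	spin_lock(&demo_lock);
	/* Publication: readers see fully initialized memory. */
	oldp = rcu_replace_pointer(demo_gp, newp,
				   lockdep_is_held(&demo_lock));
	spin_unlock(&demo_lock);

	synchronize_rcu();	/* Wait for pre-existing readers. */
	kfree(oldp);
}

/* Reader: may run concurrently with demo_update(). */
static int demo_read(void)
{
	struct demo *p;
	int v = -1;

	rcu_read_lock();
	p = rcu_dereference(demo_gp);
	if (p)
		v = p->value;
	rcu_read_unlock();
	return v;
}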
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - binary search an array of elements
 * @key: pointer to item being searched for
 * @base: pointer to first element to search
 * @num: number of elements
 * @size: size of each element
 * @cmp: pointer to comparison function
 *
 * This function does a binary search on the given array. The
 * contents of the array should already be in ascending sorted order
 * under the provided comparison function.
 *
 * Note that the key need not have the same type as the elements in
 * the array, e.g. key could be a string and the comparison function
 * could compare the string with the struct's name field. However, if
 * the key and elements in the array are of the same type, you can use
 * the same comparison function for both sort() and bsearch().
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
	return __inline_bsearch(key, base, num, size, cmp);
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);
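/*
 * Hedged usage sketch for bsearch() above. The array, key, and
 * comparison callback are hypothetical; cmp_int() follows the
 * cmp_func_t signature (key first, array element second).
 */
#include <linux/types.h>
#include <linux/bsearch.h>

static int cmp_int(const void *key, const void *elt)
{
	int k = *(const int *)key;
	int e = *(const int *)elt;

	return (k > e) - (k < e);	/* Avoids overflow of k - e. */
}

static bool demo_contains(const int *sorted, size_t num, int needle)
{
	return bsearch(&needle, sorted, num, sizeof(*sorted), cmp_int) != NULL;
}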
5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 
5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 
6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 
7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 
8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 
8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 
9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 
10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 
10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 
11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/atomic.h>
#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
#include <net/inet_dscp.h>

#include "dev.h"

/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
static_assert(sizeof(struct bpf_fib_lookup) == 64,
	      "struct bpf_fib_lookup size check");

static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
	if (in_compat_syscall()) {
		struct compat_sock_fprog f32;

		if (len != sizeof(f32))
			return -EINVAL;
		if (copy_from_sockptr(&f32, src, sizeof(f32)))
			return -EFAULT;
		memset(dst, 0, sizeof(*dst));
		dst->len = f32.len;
		dst->filter = compat_ptr(f32.filter);
	} else {
		if (len != sizeof(*dst))
			return -EINVAL;
		if (copy_from_sockptr(dst, src, sizeof(*dst)))
			return -EFAULT;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
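/*
 * Example (illustrative, userspace): the sock_fprog copied in above is
 * what a process hands the kernel when attaching a classic filter to a
 * socket. A minimal sketch that accepts every packet might look like:
 *
 *	#include <linux/filter.h>
 *	#include <sys/socket.h>
 *
 *	static int attach_accept_all(int fd)
 *	{
 *		static struct sock_filter code[] = {
 *			{ BPF_RET | BPF_K, 0, 0, 0xffffffff },	// ret #-1: accept whole packet
 *		};
 *		struct sock_fprog fprog = {
 *			.len	= 1,
 *			.filter	= code,
 *		};
 *
 *		return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *				  &fprog, sizeof(fprog));
 *	}
 */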
/**
 * sk_filter_trim_cap - run a packet through a socket filter
 * @sk: sock associated with &sk_buff
 * @skb: buffer to filter
 * @cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then cut skb->data to the correct size returned
 * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
 * than pkt_len we keep the whole skb->data. This is the socket level
 * wrapper to bpf_prog_run. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		return -ENOMEM;
	}
	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
	if (err)
		return err;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		struct sock *save_sk = skb->sk;
		unsigned int pkt_len;

		skb->sk = sk;
		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
		skb->sk = save_sk;
		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
	return skb_get_poff(skb);
}

BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (!nla_ok(nla, skb->len - a))
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
{
	if (likely(offset >= 0))
		return offset;

	if (offset >= SKF_NET_OFF)
		return offset - SKF_NET_OFF + skb_network_offset(skb);

	if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
		return offset - SKF_LL_OFF + skb_mac_offset(skb);

	return INT_MIN;
}
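/*
 * Example (illustrative): classic BPF addresses the network and link
 * layer headers through large negative magic offsets. Assuming an skb
 * whose network header begins 14 bytes after skb->data, a cBPF load of
 * the IPv4 protocol field, "ldb [SKF_NET_OFF + 9]", is converted by
 * bpf_skb_load_helper_convert_offset() above to
 *
 *	(SKF_NET_OFF + 9) - SKF_NET_OFF + skb_network_offset(skb)
 *		= 9 + 14 = 23 bytes from skb->data
 *
 * Anything else (including a link-layer relative load when no mac
 * header was set) yields INT_MIN, which the load helpers below turn
 * into -EFAULT.
 */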
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb,
	   const void *, data, int, headlen, int, offset)
{
	u8 tmp;
	const int len = sizeof(tmp);

	offset = bpf_skb_load_helper_convert_offset(skb, offset);
	if (offset == INT_MIN)
		return -EFAULT;

	if (headlen - offset >= len)
		return *(u8 *)(data + offset);
	if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
		return tmp;
	else
		return -EFAULT;
}

BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
	   int, offset)
{
	return ____bpf_skb_load_helper_8(skb, skb->data,
					 skb->len - skb->data_len, offset);
}

BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb,
	   const void *, data, int, headlen, int, offset)
{
	__be16 tmp;
	const int len = sizeof(tmp);

	offset = bpf_skb_load_helper_convert_offset(skb, offset);
	if (offset == INT_MIN)
		return -EFAULT;

	if (headlen - offset >= len)
		return get_unaligned_be16(data + offset);
	if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
		return be16_to_cpu(tmp);
	else
		return -EFAULT;
}

BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
	   int, offset)
{
	return ____bpf_skb_load_helper_16(skb, skb->data,
					  skb->len - skb->data_len, offset);
}

BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb,
	   const void *, data, int, headlen, int, offset)
{
	__be32 tmp;
	const int len = sizeof(tmp);

	offset = bpf_skb_load_helper_convert_offset(skb, offset);
	if (offset == INT_MIN)
		return -EFAULT;

	if (headlen - offset >= len)
		return get_unaligned_be32(data + offset);
	if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
		return be32_to_cpu(tmp);
	else
		return -EFAULT;
}

BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
	   int, offset)
{
	return ____bpf_skb_load_helper_32(skb, skb->data,
					  skb->len - skb->data_len, offset);
}

static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		break;

	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_all));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
		break;
	}

	return insn - insn_buf;
}
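/*
 * Example (illustrative): the classic BPF "ancillary data" extensions
 * are ordinary BPF_LD | BPF_ABS loads at magic offsets above SKF_AD_OFF.
 * A cBPF instruction fetching the packet type,
 *
 *	{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PKTTYPE }
 *
 * is recognized by convert_bpf_extensions() below and rewritten via
 * convert_skb_access() above into direct sk_buff loads, roughly:
 *
 *	A = *(u8 *)(CTX + PKT_TYPE_OFFSET);
 *	A &= PKT_TYPE_MAX;
 */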
static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A,
					 BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A,
					 BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A,
					 BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}

static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
{
	const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
	int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
	bool endian = BPF_SIZE(fp->code) == BPF_H ||
		      BPF_SIZE(fp->code) == BPF_W;
	bool indirect = BPF_MODE(fp->code) == BPF_IND;
	const int ip_align = NET_IP_ALIGN;
	struct bpf_insn *insn = *insnp;
	int offset = fp->k;

	if (!indirect &&
	    ((unaligned_ok && offset >= 0) ||
	     (!unaligned_ok && offset >= 0 &&
	      offset + ip_align >= 0 &&
	      offset + ip_align % size == 0))) {
		bool ldx_off_ok = offset <= S16_MAX;

		*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
		if (offset)
			*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
		*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size,
				      2 + endian + (!ldx_off_ok * 2));
		if (ldx_off_ok) {
			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
					      BPF_REG_D, offset);
		} else {
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
			*insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
					      BPF_REG_TMP, 0);
		}
		if (endian)
			*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
		*insn++ = BPF_JMP_A(8);
	}

	*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
	*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
	*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
	if (!indirect) {
		*insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
	} else {
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
		if (fp->k)
			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
	}

	switch (BPF_SIZE(fp->code)) {
	case BPF_B:
		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
		break;
	case BPF_H:
		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
		break;
	case BPF_W:
		*insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
		break;
	default:
		return false;
	}

	*insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
	*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
	*insn   = BPF_EXIT_INSN();

	*insnp = insn;
	return true;
}
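/*
 * Example (illustrative): the canonical tcpdump-style load of the
 * EtherType, "ldh [12]", i.e.
 *
 *	{ BPF_LD | BPF_H | BPF_ABS, 0, 0, 12 }
 *
 * is expanded by convert_bpf_ld_abs() above into an inline fast path:
 * bounds-check the offset against the headlen cached in BPF_REG_H,
 * load a halfword from BPF_REG_D (the cached skb->data) + 12, then
 * byte swap to host order; plus a slow-path fallback that calls
 * bpf_skb_load_helper_16() when the fast-path check fails (e.g. the
 * bytes live in non-linear data or the offset is negative).
 */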
/**
 * bpf_convert_filter - convert filter program
 * @prog: the user passed filter program
 * @len: the length of the user passed filter program
 * @new_prog: allocated 'struct bpf_prog' or NULL
 * @new_len: pointer to store length of converted program
 * @seen_ld_abs: bool whether we've seen ld_abs/ind
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *	bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 *
 * 2) 2nd pass to remap, itself done in two passes: the 1st pass finds the
 *    new jump offsets, the 2nd pass does the remapping:
 *	bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_prog *new_prog, int *new_len,
			      bool *seen_ld_abs)
{
	int new_flen = 0, pass = 0, target, i, stack_off;
	struct bpf_insn *new_insn, *first_insn = NULL;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		first_insn = new_prog->insnsi;
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = first_insn;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_prog) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
		 */
		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

		/* All programs must keep CTX in callee saved BPF_REG_CTX.
		 * In eBPF case it's done by the compiler, here we need to
		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
		 */
		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
		if (*seen_ld_abs) {
			/* For packet access in classic BPF, cache skb->data
			 * in callee-saved BPF R8 and skb->len - skb->data_len
			 * (headlen) in BPF R9. Since classic BPF is read-only
			 * on CTX, we only need to cache it once.
			 */
			*new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
						  BPF_REG_D, BPF_REG_CTX,
						  offsetof(struct sk_buff, data));
			*new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
						  offsetof(struct sk_buff, len));
			*new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
						  offsetof(struct sk_buff, data_len));
			*new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
		}
	} else {
		new_insn += 3;
	}

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[32] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - first_insn;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
*/ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. */ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. */ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. 
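 *
 * Editor's illustration (not from the original source): with jt == 0
 * the true branch falls through, so the condition is inverted and only
 * the false target gets a jump, e.g. cBPF "jgt #K, jt=0, jf=Lf" becomes
 * the single eBPF insn "if (A <= K) goto Lf" (BPF_JGT -> BPF_JLE)
 * rather than the Jxx + JA pair emitted under jmp_rest below.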
*/ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. 
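 *
 * (Editor's note: the four extra slots match the prologue emitted above
 * when ld_abs/ind was seen: three BPF_LDX_MEM loads of skb->data,
 * skb->len and skb->data_len plus one BPF_ALU32 subtraction that leaves
 * the linear headlen in BPF_REG_H.)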
 */
		return 0;
	}

	pass++;
	if (new_flen != new_insn - first_insn) {
		new_flen = new_insn - first_insn;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}

/* Security:
 *
 * As we don't want to clear the mem[] array for each packet going through
 * __bpf_prog_run(), we check that a filter loaded by the user never tries
 * to read a cell that was not previously written, and we check all branches
 * to be sure a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}

static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe
>= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * 
@rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. */ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it won't be used at * this point in time anymore internally after the migration to the eBPF * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. */ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. 
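 *
 * (Editor's note: the best-known user of this hook is seccomp, whose
 * bpf_aux_classic_check_t callback validates and rewrites loads so they
 * only touch struct seccomp_data; this is meant as an illustration of
 * the hook's intent, not a claim that it is the only user.)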
*/ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the eBPF translation * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { __bpf_prog_free(fp); return -EFAULT; } fp->len = fprog->len; fp->orig_prog = NULL; if (save_orig) { err = bpf_prog_store_orig_filter(fp, fprog); if (err) { __bpf_prog_free(fp); return -ENOMEM; } } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. 
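 *
 * (Editor's note: this ownership rule is why the error path below only
 * propagates PTR_ERR() and must not free fp again; a failing
 * bpf_prepare_filter() has already dropped fp together with the
 * orig_prog copy stored above.)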
*/ fp = bpf_prepare_filter(fp, trans); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; fp = kmalloc(sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->prog = prog; if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); return ERR_PTR(-EFAULT); } prog->len = fprog->len; err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ return bpf_prepare_filter(prog, NULL); } /** * sk_attach_filter - attach a socket filter * @fprog: the filter program * @sk: the socket to use * * Attach the user's filter code. We first run some sanity checks on * it to make sure it does not explode on us later. If an error * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; } return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err, optmem_max; if (IS_ERR(prog)) return PTR_ERR(prog); optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); if (err) __bpf_prog_release(prog); return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0; } int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; int err, optmem_max; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); if (PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { /* Like other non BPF_PROG_TYPE_SOCKET_FILTER * bpf prog (e.g. sockmap). It depends on the * limitation imposed by bpf_prog_load(). 
* Hence, sysctl_optmem_max is not checked. */ if ((sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) || (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_TCP) || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { err = -ENOTSUPP; goto err_prog_put; } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); if (bpf_prog_size(prog->len) > optmem_max) { err = -ENOMEM; goto err_prog_put; } } err = reuseport_attach_prog(sk, prog); err_prog_put: if (err) bpf_prog_put(prog); return err; } void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) bpf_prog_put(prog); else bpf_prog_destroy(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { #ifdef CONFIG_DEBUG_NET /* Avoid a splat in pskb_may_pull_reason() */ if (write_len > INT_MAX) return -EINVAL; #endif return skb_ensure_writable(skb, write_len); } static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_pointers(skb); return err; } static int bpf_try_make_head_writable(struct sk_buff *skb) { return bpf_try_make_writable(skb, skb_headlen(skb)); } static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); } static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len, u64, flags) { void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return ____bpf_skb_store_bytes(skb, offset, from, len, flags); } BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return ____bpf_skb_load_bytes(skb, offset, to, len); } BPF_CALL_4(bpf_flow_dissector_load_bytes, const struct bpf_flow_dissector *, ctx, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > 0xffff)) goto 
err_clear; if (unlikely(!ctx->skb)) goto err_clear; ptr = skb_header_pointer(ctx->skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { .func = bpf_flow_dissector_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); u8 *start, *ptr; if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: if (unlikely(!skb_mac_header_was_set(skb))) goto err_clear; start = skb_mac_header(skb); break; case BPF_HDR_START_NET: start = skb_network_header(skb); break; default: goto err_clear; } ptr = start + offset; if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { .func = bpf_skb_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto bpf_skb_pull_data_proto = { .func = bpf_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) { return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; } static const struct bpf_func_proto bpf_sk_fullsock_proto = { .func = bpf_sk_fullsock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? 
: skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. 
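 *
 * Editor's illustration (not from the original source): a program that
 * rewrites a 4-byte header field can compute the checksum delta with
 * bpf_csum_diff(&old_val, 4, &new_val, 4, 0) and feed the result to
 * bpf_l4_csum_replace() or bpf_csum_update(); pushing fresh bytes would
 * instead use from_size == 0 with seed set to the running csum.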
 */
	__wsum ret = seed;

	if (from_size && to_size)
		ret = csum_sub(csum_partial(to, to_size, ret),
			       csum_partial(from, from_size, 0));
	else if (to_size)
		ret = csum_partial(to, to_size, ret);
	else if (from_size)
		ret = ~csum_partial(from, from_size, ~ret);

	return csum_from32to16((__force unsigned int)ret);
}

static const struct bpf_func_proto bpf_csum_diff_proto = {
	.func		= bpf_csum_diff,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
	/* The interface is to be used in combination with bpf_csum_diff()
	 * for direct packet writes. csum rotation for alignment as well
	 * as emulating csum_sub() can be done from the eBPF program.
	 */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		return (skb->csum = csum_add(skb->csum, csum));

	return -ENOTSUPP;
}

static const struct bpf_func_proto bpf_csum_update_proto = {
	.func		= bpf_csum_update,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
{
	/* The interface is to be used in combination with bpf_skb_adjust_room()
	 * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
	 * is passed as flags, for example.
	 */
	switch (level) {
	case BPF_CSUM_LEVEL_INC:
		__skb_incr_checksum_unnecessary(skb);
		break;
	case BPF_CSUM_LEVEL_DEC:
		__skb_decr_checksum_unnecessary(skb);
		break;
	case BPF_CSUM_LEVEL_RESET:
		__skb_reset_checksum_unnecessary(skb);
		break;
	case BPF_CSUM_LEVEL_QUERY:
		return skb->ip_summed == CHECKSUM_UNNECESSARY ?
		       skb->csum_level : -EACCES;
	default:
		return -EINVAL;
	}

	return 0;
}

static const struct bpf_func_proto bpf_csum_level_proto = {
	.func		= bpf_csum_level,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
	return dev_forward_skb_nomtu(dev, skb);
}

static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
				      struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb, false);

	if (likely(!ret)) {
		skb->dev = dev;
		ret = netif_rx(skb);
	}

	return ret;
}

static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
	}

	skb->dev = dev;
	skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
	skb_clear_tstamp(skb);

	dev_xmit_recursion_inc();
	ret = dev_queue_xmit(skb);
	dev_xmit_recursion_dec();

	return ret;
}

static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	unsigned int mlen = skb_network_offset(skb);

	if (unlikely(skb->len <= mlen)) {
		kfree_skb(skb);
		return -ERANGE;
	}

	if (mlen) {
		__skb_pull(skb, mlen);

		/* At ingress, the mac header has already been pulled once.
		 * At egress, skb_postpull_rcsum() has to be done in case
		 * the skb originated from ingress (i.e. a forwarded skb)
		 * to ensure that the rcsum starts at the net header.
		 */
		if (!skb_at_tc_ingress(skb))
			skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
	}
	skb_pop_mac_header(skb);
	skb_reset_mac_len(skb);
	return flags & BPF_F_INGRESS ?
__bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); else return __bpf_redirect_no_mac(skb, dev, flags); } #if IS_ENABLED(CONFIG_IPV6) static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { dst = skb_dst(skb); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); } else { nexthop = &nh->ipv6_nh; } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, false); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_flags = FLOWI_FLAG_ANYSRC, .flowi6_mark = skb->mark, .flowlabel = ip6_flowinfo(ip6h), .flowi6_oif = dev->ifindex, .flowi6_proto = ip6h->nexthdr, .daddr = ip6h->daddr, .saddr = ip6h->saddr, }; dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst)) goto out_drop; skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; } err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); goto out_drop; } skb->dev = dev; skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } rcu_read_lock(); if (!nh) { struct rtable *rt = skb_rtable(skb); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); } else if (nh->nh_family == AF_INET6) { neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); is_v6gw = true; } else 
if (nh->nh_family == AF_INET) { neigh = ip_neigh_gw4(dev, nh->ipv4_nh); } else { rcu_read_unlock(); goto out_drop; } if (likely(!IS_ERR(neigh))) { int ret; sock_confirm_neigh(skb, neigh); local_bh_disable(); dev_xmit_recursion_inc(); ret = neigh_output(neigh, skb, is_v6gw); dev_xmit_recursion_dec(); local_bh_enable(); rcu_read_unlock(); return ret; } rcu_read_unlock(); out_drop: kfree_skb(skb); return -ENETDOWN; } static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; if (!nh) { struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, .saddr = ip4h->saddr, }; struct rtable *rt; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out_drop; if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); goto out_drop; } skb_dst_set(skb, &rt->dst); } err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; } #else static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); if (unlikely(skb->mac_header >= skb->network_header)) goto out; bpf_push_mac_rcsum(skb); if (is_multicast_ether_addr(ethh->h_dest)) goto out; skb_pull(skb, sizeof(*ethh)); skb_unset_mac_header(skb); skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; } /* Internal, non-exposed redirect flags. */ enum { BPF_F_NEIGH = (1ULL << 16), BPF_F_PEER = (1ULL << 17), BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; clone = skb_clone(skb, GFP_ATOMIC); if (unlikely(!clone)) return -ENOMEM; /* For direct write, we need to keep the invariant that the skbs * we're dealing with need to be uncloned. Should uncloning fail * here, we need to free the just generated clone to unclone once * again. 
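 *
 * (Editor's note: it is the clone that is transmitted via
 * __bpf_redirect() below while the original skb stays with the calling
 * program, which is why the original is made head-writable here;
 * contrast with bpf_redirect(), which only records the target and lets
 * the original skb be redirected after the program returns.)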
*/ ret = bpf_try_make_head_writable(skb); if (unlikely(ret)) { kfree_skb(clone); return -ENOMEM; } return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static struct net_device *skb_get_peer_dev(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; if (likely(ops->ndo_get_peer_dev)) return INDIRECT_CALL_1(ops->ndo_get_peer_dev, netkit_peer_dev, dev); return NULL; } int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; ri->flags = 0; if (unlikely(!dev)) goto out_drop; if (flags & BPF_F_PEER) { if (unlikely(!skb_at_tc_ingress(skb))) goto out_drop; dev = skb_get_peer_dev(dev); if (unlikely(!dev || !(dev->flags & IFF_UP) || net_eq(net, dev_net(dev)))) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return -EAGAIN; } return flags & BPF_F_NEIGH ? __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return TC_ACT_SHOT; ri->flags = flags; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return TC_ACT_SHOT; ri->flags = BPF_F_PEER; ri->tgt_index = ifindex; return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_peer_proto = { .func = bpf_redirect_peer, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, int, plen, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; ri->flags = BPF_F_NEIGH | (plen ? 
BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); if (plen) memcpy(&ri->nh, params, sizeof(ri->nh)); return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_neigh_proto = { .func = bpf_redirect_neigh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .func = bpf_msg_apply_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; } static void sk_msg_reset_curr(struct sk_msg *msg) { if (!msg->sg.size) { msg->sg.curr = msg->sg.start; msg->sg.copybreak = 0; } else { u32 i = msg->sg.end; sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, u32, end, u64, flags) { u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; u32 first_sge, last_sge, i, shift, bytes_sg_total; struct scatterlist *sge; u8 *raw, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += len; len = sk_msg_elem(msg, i)->length; if (start < offset + len) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); if (unlikely(start >= offset + len)) return -EINVAL; first_sge = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then * place the buffer in the scatterlist and fixup the original * entries by removing the entries now in the linear buffer * and shifting the remaining entries. For now we do not try * to copy partial entries to avoid complexity of running out * of sg_entry slots. The downside is reading a single byte * will copy the entire sg entry. */ do { copy += sk_msg_elem(msg, i)->length; sk_msg_iter_var_next(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg.end); last_sge = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy)); if (unlikely(!page)) return -ENOMEM; raw = page_address(page); i = first_sge; do { sge = sk_msg_elem(msg, i); from = sg_virt(sge); len = sge->length; to = raw + poffset; memcpy(to, from, len); poffset += len; sge->length = 0; put_page(sg_page(sge)); sk_msg_iter_var_next(i); } while (i != last_sge); sg_set_page(&msg->sg.data[first_sge], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ WARN_ON_ONCE(last_sge == first_sge); shift = last_sge > first_sge ? 
last_sge - first_sge - 1 :
		NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
	if (!shift)
		goto out;

	i = first_sge;
	sk_msg_iter_var_next(i);
	do {
		u32 move_from;

		if (i + shift >= NR_MSG_FRAG_IDS)
			move_from = i + shift - NR_MSG_FRAG_IDS;
		else
			move_from = i + shift;
		if (move_from == msg->sg.end)
			break;

		msg->sg.data[i] = msg->sg.data[move_from];
		msg->sg.data[move_from].length = 0;
		msg->sg.data[move_from].page_link = 0;
		msg->sg.data[move_from].offset = 0;
		sk_msg_iter_var_next(i);
	} while (1);

	msg->sg.end = msg->sg.end - shift > msg->sg.end ?
		      msg->sg.end - shift + NR_MSG_FRAG_IDS :
		      msg->sg.end - shift;
out:
	sk_msg_reset_curr(msg);
	msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
	msg->data_end = msg->data + bytes;
	return 0;
}

static const struct bpf_func_proto bpf_msg_pull_data_proto = {
	.func		= bpf_msg_pull_data,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
	   u32, len, u64, flags)
{
	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
	u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
	u8 *raw, *to, *from;
	struct page *page;

	if (unlikely(flags))
		return -EINVAL;
	if (unlikely(len == 0))
		return 0;

	/* First find the starting scatterlist element */
	i = msg->sg.start;
	do {
		offset += l;
		l = sk_msg_elem(msg, i)->length;

		if (start < offset + l)
			break;
		sk_msg_iter_var_next(i);
	} while (i != msg->sg.end);

	if (start > offset + l)
		return -EINVAL;

	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

	/* If no space is available we fall back to copying; we need at
	 * least one scatterlist elem available to push data into
	 * when start aligns to the beginning of an element, or two
	 * when it falls inside an element. We handle the start equals
	 * offset case because it's the common case for inserting a
	 * header.
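	 *
	 * (Editor's illustration: pushing at an element boundary
	 * (start == offset) needs one free ring slot for the new page, while
	 * pushing into the middle of an element needs two, since the element
	 * is split around the insertion point; short of that, the affected
	 * element is copied into the newly allocated page together with the
	 * pushed bytes, as done right below.)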
*/ if (!space || (space == 1 && start != offset)) copy = msg->sg.data[i].length; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy + len)); if (unlikely(!page)) return -ENOMEM; if (copy) { int front, back; raw = page_address(page); if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; from = sg_virt(psge); if (front) memcpy(raw, from, front); if (back) { from += front; to = raw + front + len; memcpy(to, from, back); } put_page(sg_page(psge)); new = i; goto place_new; } if (start - offset) { if (i == msg->sg.end) sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); psge->length = start - offset; rsge.length -= psge->length; rsge.offset += start; sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); } /* Slot(s) to place newly allocated data */ sk_msg_iter_next(msg, end); new = i; sk_msg_iter_var_next(i); if (i == msg->sg.end) { if (!rsge.length) goto place_new; sk_msg_iter_next(msg, end); goto place_new; } /* Shift one or two slots as needed */ sge = sk_msg_elem_cpy(msg, new); sg_unmark_end(&sge); nsge = sk_msg_elem_cpy(msg, i); if (rsge.length) { sk_msg_iter_var_next(i); nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); } while (i != msg->sg.end) { msg->sg.data[i] = sge; sge = nsge; sk_msg_iter_var_next(i); if (rsge.length) { nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); } else { nsge = sk_msg_elem_cpy(msg, i); } } place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); sk_msg_iter_var_next(new); msg->sg.data[new] = rsge; } sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_push_data_proto = { .func = bpf_msg_push_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static void sk_msg_shift_left(struct sk_msg *msg, int i) { struct scatterlist *sge = sk_msg_elem(msg, i); int prev; put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); msg->sg.data[prev] = msg->sg.data[i]; } while (i != msg->sg.end); sk_msg_iter_prev(msg, end); } static void sk_msg_shift_right(struct sk_msg *msg, int i) { struct scatterlist tmp, sge; sk_msg_iter_next(msg, end); sge = sk_msg_elem_cpy(msg, i); sk_msg_iter_var_next(i); tmp = sk_msg_elem_cpy(msg, i); while (i != msg->sg.end) { msg->sg.data[i] = sge; sk_msg_iter_var_next(i); sge = tmp; tmp = sk_msg_elem_cpy(msg, i); } } BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, u32, len, u64, flags) { u32 i = 0, l = 0, space, offset = 0; u64 last = start + len; int pop; if (unlikely(flags)) return -EINVAL; if (unlikely(len == 0)) return 0; /* First find the starting scatterlist element */ i = msg->sg.start; do { offset += l; l = sk_msg_elem(msg, i)->length; if (start < offset + l) break; sk_msg_iter_var_next(i); } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); pop = len; /* --------------| offset * -| start |-------- len -------| * * |----- a ----|-------- pop -------|----- b ----| * |______________________________________________| length * * * a: region at front of scatter element to save * b: region at back 
of scatter element to save when length > a + pop
	 * pop: region to pop from element; same as the input 'pop', it will
	 * be decremented per iteration below.
	 *
	 * Two top-level cases to handle when start != offset: in the first,
	 * b is non-zero, and in the second b is zero, corresponding to a pop
	 * that spans more than one element.
	 *
	 * Then if b is non-zero AND there is no space, allocate space and
	 * compact the a, b regions into a page. If there is space, shift the
	 * ring to the right, freeing the next element in the ring to place b,
	 * leaving a untouched except to reduce its length.
	 */
	if (start != offset) {
		struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
		int a = start - offset;
		int b = sge->length - pop - a;

		sk_msg_iter_var_next(i);
		if (b > 0) {
			if (space) {
				sge->length = a;
				sk_msg_shift_right(msg, i);
				nsge = sk_msg_elem(msg, i);
				get_page(sg_page(sge));
				sg_set_page(nsge, sg_page(sge), b,
					    sge->offset + pop + a);
			} else {
				struct page *page, *orig;
				u8 *to, *from;

				page = alloc_pages(__GFP_NOWARN |
						   __GFP_COMP | GFP_ATOMIC,
						   get_order(a + b));
				if (unlikely(!page))
					return -ENOMEM;

				orig = sg_page(sge);
				from = sg_virt(sge);
				to = page_address(page);
				memcpy(to, from, a);
				memcpy(to + a, from + a + pop, b);
				sg_set_page(sge, page, a + b, 0);
				put_page(orig);
			}
			pop = 0;
		} else {
			pop -= (sge->length - a);
			sge->length = a;
		}
	}

	/* From above the current layout _must_ be as follows,
	 *
	 * -| offset
	 * -| start
	 *
	 * |---- pop ---|---------------- b ------------|
	 * |____________________________________________| length
	 *
	 * Offset and start of the current msg elem are equal because in the
	 * previous case we handled offset != start and either consumed the
	 * entire element and advanced to the next element OR pop == 0.
	 *
	 * Two cases to handle here: first, pop is less than the length of
	 * the element, leaving some remainder b above; simply adjust the
	 * element's layout in this case. Or, pop >= the length of the
	 * element, so that b = 0; in this case advance to the next element,
	 * decrementing pop.
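	 *
	 * (Editor's illustration: popping 100 bytes out of the middle of a
	 * single 300-byte element with a spare ring slot just shrinks it to
	 * a and parks the remaining b = 200 - a tail bytes in the freed
	 * slot; without a spare slot both pieces were memcpy'd into one
	 * freshly allocated page in the branch above.)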
*/ while (pop) { struct scatterlist *sge = sk_msg_elem(msg, i); if (pop < sge->length) { sge->length -= pop; sge->offset += pop; pop = 0; } else { pop -= sge->length; sk_msg_shift_left(msg, i); } } sk_mem_uncharge(msg->sk, len - pop); msg->sg.size -= (len - pop); sk_msg_reset_curr(msg); sk_msg_compute_data_pointers(msg); return 0; } static const struct bpf_func_proto bpf_msg_pop_data_proto = { .func = bpf_msg_pop_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; #ifdef CONFIG_CGROUP_NET_CLASSID BPF_CALL_0(bpf_get_cgroup_classid_curr) { return __task_get_classid(current); } const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, }; BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return 0; return sock_cgroup_classid(&sk->sk_cgrp_data); } static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { .func = bpf_skb_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .func = bpf_get_hash_recalc, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) { /* After all direct packet write, this can be used once for * triggering a lazy recalc on next skb_get_hash() invocation. */ skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .func = bpf_set_hash_invalid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) { /* Set user specified hash as L4(+), so that it gets returned * on skb_get_hash() call unless BPF prog later on triggers a * skb_clear_hash(). 
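 *
 * (Editor's note: the 'true' passed below marks the value as an L4
 * hash, so skb_get_hash() will trust and return it without redoing
 * flow dissection until bpf_set_hash_invalid()/skb_clear_hash() wipes
 * it again.)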
*/ __skb_set_sw_hash(skb, hash, true); return 0; } static const struct bpf_func_proto bpf_set_hash_proto = { .func = bpf_set_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { int ret; bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with len as headroom, * so no need to do it here. */ skb_push(skb, len); memmove(skb->data, skb->data + len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) * needed here as it does not change the skb->csum * result for checksum complete when summing over * zeroed blocks. */ return 0; } static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { void *old_data; /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); memmove(skb->data, old_data, off); return 0; } static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* There's no need for __skb_push()/__skb_pull() pair to * get to the start of the mac header as we're guaranteed * to always start from here under eBPF. */ ret = bpf_skb_generic_push(skb, off, len); if (likely(!ret)) { skb->mac_header -= len; skb->network_header -= len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* Same here, __skb_push()/__skb_pull() pair not needed. */ ret = bpf_skb_generic_pop(skb, off, len); if (likely(!ret)) { skb->mac_header += len; skb->network_header += len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. 
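 * Otherwise a later segmentation of the translated packet would still
 * be attempted via the TCPv4 GSO path even though skb->protocol is
 * now IPv6.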
*/ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } } skb->protocol = htons(ETH_P_IPV6); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } } skb->protocol = htons(ETH_P_IP); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb->protocol; if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, u64, flags) { int ret; if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork * needed for changing the protocol, and eBPF program fills the * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() * and other helpers, rather than passing a raw buffer here. * * The rationale is to keep this minimal and without a need to * deal with raw packet data. F.e. even if we would pass buffers * here, the program still needs to call the bpf_lX_csum_replace() * helpers anyway. Plus, this way we keep also separation of * concerns, since f.e. bpf_skb_store_bytes() should only take * care of stores. * * Currently, additional options and extension header space are * not supported, but flags register is reserved so we can adapt * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { .func = bpf_skb_change_proto, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { /* We only allow a restricted subset to be changed for now. 
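 *
 * "Restricted" means both the current and the requested type must pass
 * skb_pkt_type_ok(); at the time of writing that is the
 * PACKET_HOST/BROADCAST/MULTICAST/OTHERHOST range, so e.g. switching a
 * packet to PACKET_OUTGOING is rejected with -EINVAL below.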
*/ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; return 0; } static const struct bpf_func_proto bpf_skb_change_type_proto = { .func = bpf_skb_change_type, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): return sizeof(struct ipv6hdr); default: return ~0U; } } #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_DECAP_L3_IPV6) #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; unsigned int gso_type = SKB_GSO_DODGY; int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_cow_head(skb, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -ENOTSUPP; if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && inner_mac_len < ETH_HLEN) return -EINVAL; if (skb->encapsulation) return -EALREADY; mac_len = skb->network_header - skb->mac_header; inner_net = skb->network_header; if (inner_mac_len > len_diff) return -EINVAL; inner_trans = skb->transport_header; } ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (encap) { skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) skb_set_inner_protocol(skb, htons(ETH_P_TEB)); else skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) gso_type |= SKB_GSO_UDP_TUNNEL; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) gso_type |= SKB_GSO_GRE; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) gso_type |= SKB_GSO_IPXIP6; else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) gso_type |= SKB_GSO_IPXIP4; if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? 
sizeof(struct ipv6hdr) : sizeof(struct iphdr); skb_set_transport_header(skb, mac_len + nh_len); } /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); } if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= gso_type; shinfo->gso_segs = 0; /* Due to header growth, MSS needs to be downgraded. * There is a BUG_ON() when segmenting the frag_list with * head_frag true, so linearize the skb after downgrading * the MSS. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { skb_decrease_gso_size(shinfo, len_diff); if (shinfo->frag_list) return skb_linearize(skb); } } return 0; } static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) { int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { /* udp gso_size delineates datagrams, only allow if fixed */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) return -ENOTSUPP; } ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) skb->protocol = htons(ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) skb->protocol = htons(ETH_P_IP); if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. 
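 * Clearing gso_segs forces it to be recomputed from the new payload
 * size when the packet is eventually segmented, and SKB_GSO_DODGY
 * makes the headers be revalidated there as well.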
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_diff_abs = abs(len_diff); bool shrink = len_diff < 0; int ret = 0; if (unlikely(flags || mode)) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (!shrink) { ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; __skb_push(skb, len_diff_abs); memset(skb->data, 0, len_diff_abs); } else { if (unlikely(!pskb_may_pull(skb, len_diff_abs))) return -ENOMEM; __skb_pull(skb, len_diff_abs); } if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); rxm->full_len += len_diff; } return ret; } static const struct bpf_func_proto sk_skb_adjust_room_proto = { .func = sk_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb->protocol; bool shrink = len_diff < 0; u32 off; int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; off = skb_mac_header_len(skb); switch (mode) { case BPF_ADJ_ROOM_NET: off += bpf_skb_net_base_len(skb); break; case BPF_ADJ_ROOM_MAC: break; default: return -ENOTSUPP; } if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { if (!shrink) return -EINVAL; switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: len_min = sizeof(struct iphdr); break; case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: len_min = sizeof(struct ipv6hdr); break; default: return -EINVAL; } } len_cur = skb->len - skb_network_offset(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && !skb_is_gso(skb)))) return -ENOTSUPP; ret = shrink ? 
bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static u32 __bpf_skb_min_len(const struct sk_buff *skb) { int offset = skb_network_offset(skb); u32 min_len = 0; if (offset > 0) min_len = offset; if (skb_transport_header_was_set(skb)) { offset = skb_transport_offset(skb); if (offset > 0) min_len = offset; } if (skb->ip_summed == CHECKSUM_PARTIAL) { offset = skb_checksum_start_offset(skb) + skb->csum_offset + sizeof(__sum16); if (offset > 0) min_len = offset; } return min_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; int ret; ret = __skb_grow_rcsum(skb, new_len); if (!ret) memset(skb->data + old_len, 0, new_len - old_len); return ret; } static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) { return __skb_trim_rcsum(skb, new_len); } static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) return -EINVAL; if (skb->encapsulation) return -ENOTSUPP; /* The basic idea of this helper is that it's performing the * needed work to either grow or trim an skb, and eBPF program * rewrites the rest via helpers like bpf_skb_store_bytes(), * bpf_lX_csum_replace() and others rather than passing a raw * buffer here. This one is a slow path helper and intended * for replies with control messages. * * Like in bpf_skb_change_proto(), we want to keep this rather * minimal and without protocol specifics so that we are able * to separate concerns as in bpf_skb_store_bytes() should only * be the one responsible for writing buffers. * * It's really expected to be a slow path operation here for * control message replies, so we're implicitly linearizing, * uncloning and drop offloads from the skb by this. 
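 *
 * BPF-side sketch (illustrative only; 'reply' and 'reply_len' are
 * hypothetical program state):
 *
 *   if (bpf_skb_change_tail(skb, reply_len, 0))
 *       return TC_ACT_SHOT;
 *   bpf_skb_store_bytes(skb, 0, reply, reply_len, 0);
 *   - then fix up checksums via bpf_l3_csum_replace() /
 *     bpf_l4_csum_replace() as needed.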
*/ ret = __bpf_try_make_writable(skb, skb->len); if (!ret) { if (new_len > skb->len) ret = bpf_skb_grow_rcsum(skb, new_len); else if (new_len < skb->len) ret = bpf_skb_trim_rcsum(skb, new_len); if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } return ret; } BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_tail_proto = { .func = bpf_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { .func = sk_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; ret = skb_cow(skb, head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that * skb->protocol network header, etc, stay as is. * Compared to bpf_skb_change_tail(), we're more * flexible due to not needing to linearize or * reset GSO. Intention for this helper is to be * used by an L3 skb that needs to push mac header * for redirection into L2 device. */ __skb_push(skb, head_room); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); } return ret; } BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { .func = bpf_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { .func = sk_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) { return xdp_get_buff_len(xdp); } static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { .func = bpf_xdp_get_buff_len, .gpl_only = false, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0], }; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : xdp->data - xdp->data_meta; } BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); xdp->data_meta += offset; xdp->data = data; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .func = bpf_xdp_adjust_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { unsigned long ptr_len, ptr_off = 0; skb_frag_t *next_frag, *end_frag; struct skb_shared_info *sinfo; void *src, *dst; u8 *ptr_buf; if (likely(xdp->data_end - xdp->data >= off + len)) { src = flush ? buf : xdp->data + off; dst = flush ? xdp->data + off : buf; memcpy(dst, src, len); return; } sinfo = xdp_get_shared_info_from_buff(xdp); end_frag = &sinfo->frags[sinfo->nr_frags]; next_frag = &sinfo->frags[0]; ptr_len = xdp->data_end - xdp->data; ptr_buf = xdp->data; while (true) { if (off < ptr_off + ptr_len) { unsigned long copy_off = off - ptr_off; unsigned long copy_len = min(len, ptr_len - copy_off); src = flush ? buf : ptr_buf + copy_off; dst = flush ? ptr_buf + copy_off : buf; memcpy(dst, src, copy_len); off += copy_len; len -= copy_len; buf += copy_len; } if (!len || next_frag == end_frag) break; ptr_off += ptr_len; ptr_buf = skb_frag_address(next_frag); ptr_len = skb_frag_size(next_frag); next_frag++; } } void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { u32 size = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; void *addr = xdp->data; int i; if (unlikely(offset > 0xffff || len > 0xffff)) return ERR_PTR(-EFAULT); if (unlikely(offset + len > xdp_get_buff_len(xdp))) return ERR_PTR(-EINVAL); if (likely(offset < size)) /* linear area */ goto out; sinfo = xdp_get_shared_info_from_buff(xdp); offset -= size; for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ u32 frag_size = skb_frag_size(&sinfo->frags[i]); if (offset < frag_size) { addr = skb_frag_address(&sinfo->frags[i]); size = frag_size; break; } offset -= frag_size; } out: return offset + len <= size ? 
addr + offset : NULL; } BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, false); else memcpy(buf, ptr, len); return 0; } static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { .func = bpf_xdp_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_load_bytes(xdp, offset, buf, len); } BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, void *, buf, u32, len) { void *ptr; ptr = bpf_xdp_pointer(xdp, offset, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); if (!ptr) bpf_xdp_copy_buf(xdp, offset, buf, len, true); else memcpy(ptr, buf, len); return 0; } static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .func = bpf_xdp_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return ____bpf_xdp_store_bytes(xdp, offset, buf, len); } static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; struct xdp_rxq_info *rxq = xdp->rxq; unsigned int tailroom; if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) return -EOPNOTSUPP; tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag); if (unlikely(offset > tailroom)) return -EINVAL; memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); skb_frag_size_add(frag, offset); sinfo->xdp_frags_size += offset; if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) xsk_buff_get_tail(xdp)->data_end += offset; return 0; } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } if (release) __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; } static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); int i, n_frags_free = 0, len_free = 0; if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) return -EINVAL; for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) { skb_frag_t *frag = &sinfo->frags[i]; int shrink = min_t(int, offset, skb_frag_size(frag)); len_free += shrink; offset -= shrink; if (bpf_xdp_shrink_data(xdp, frag, shrink)) { n_frags_free++; } else { skb_frag_size_sub(frag, shrink); break; } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); xdp->data_end -= offset; } return 0; } BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { 
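	/* Positive offset grows the tail (the new area is zeroed below),
	 * negative offset shrinks it; the result must stay within the
	 * frame's tailroom and keep at least ETH_HLEN bytes of data.
	 * Multi-buffer packets take the frags-aware paths instead.
	 */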
void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ if (offset < 0) return bpf_xdp_frags_shrink_tail(xdp, -offset); return bpf_xdp_frags_increase_tail(xdp, offset); } /* Notice that xdp_data_hard_end has reserved some tailroom */ if (unlikely(data_end > data_hard_end)) return -EINVAL; if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; /* Clear memory area on grow, can contain uninit kernel memory */ if (offset > 0) memset(xdp->data_end, 0, offset); xdp->data_end = data_end; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { .func = bpf_xdp_adjust_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .func = bpf_xdp_adjust_meta, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; /** * DOC: xdp redirect * * XDP_REDIRECT works by a three-step process, implemented in the functions * below: * * 1. The bpf_redirect() and bpf_redirect_map() helpers will look up the target * of the redirect and store it (along with some other metadata) in a per-CPU * struct bpf_redirect_info. * * 2. When the program returns the XDP_REDIRECT return code, the driver will * call xdp_do_redirect() which will use the information in struct * bpf_redirect_info to actually enqueue the frame into a map type-specific * bulk queue structure. * * 3. Before exiting its NAPI poll loop, the driver will call * xdp_do_flush(), which will flush all the different bulk queues, * thus completing the redirect. Note that xdp_do_flush() must be * called before napi_complete_done() in the driver, as the * XDP_REDIRECT logic relies on being inside a single NAPI instance * through to the xdp_do_flush() call for RCU protection of all * in-kernel data structures. */ /* * Pointers to the map entries will be kept around for this whole sequence of * steps, protected by RCU. However, there is no top-level rcu_read_lock() in * the core code; instead, the RCU protection relies on everything happening * inside a single NAPI poll sequence, which means it's between a pair of calls * to local_bh_disable()/local_bh_enable(). * * The map entries are marked as __rcu and the map code makes sure to * dereference those pointers with rcu_dereference_check() in a way that works * for both sections that hold an rcu_read_lock() and sections that are * called from NAPI without a separate rcu_read_lock(). The code below does not * use RCU annotations, but relies on those in the map code.
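 *
 * Schematically (an illustration of the constraint, not literal code):
 *
 *   local_bh_disable();        - NAPI poll begins
 *     bpf_prog_run(prog);      -   bpf_redirect_map() stores the target
 *     xdp_do_redirect();       -   frame enqueued per map type
 *     ...
 *     xdp_do_flush();          -   bulk queues flushed
 *   local_bh_enable();         - NAPI poll ends, protection ends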
*/ void xdp_do_flush(void) { struct list_head *lh_map, *lh_dev, *lh_xsk; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) __dev_flush(lh_dev); if (lh_map) __cpu_map_flush(lh_map); if (lh_xsk) __xsk_map_flush(lh_xsk); } EXPORT_SYMBOL_GPL(xdp_do_flush); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi) { struct list_head *lh_map, *lh_dev, *lh_xsk; bool missed = false; bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); if (lh_dev) { __dev_flush(lh_dev); missed = true; } if (lh_map) { __cpu_map_flush(lh_map); missed = true; } if (lh_xsk) { __xsk_map_flush(lh_xsk); missed = true; } WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", napi->poll); } #endif DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct net_device *master, *slave; master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); if (slave && slave != xdp->rxq->dev) { /* The target device is different from the receiving device, so * redirect it to the new device. * Using XDP_REDIRECT gets the correct behaviour from XDP enabled * drivers to unmap the packet from their rx ring. */ ri->tgt_index = slave->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } return XDP_TX; } EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, const struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; err = __xsk_map_redirect(fwd, xdp); if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (unlikely(!xdpf)) { err = -EOVERFLOW; goto err; } switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_enqueue_multi(xdpf, dev, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; break; } err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; default: err = -EBADRQC; } if (unlikely(err)) goto err; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: 
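	/* map_type, map_id and fwd were captured into locals before the
	 * per-CPU redirect state was cleared above, so the tracepoint
	 * below still reports the original target.
	 */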
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); } EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog, void *fwd, enum bpf_map_type map_type, u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; int err; switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; case BPF_MAP_TYPE_DEVMAP_HASH: if (unlikely(flags & BPF_F_BROADCAST)) { map = READ_ONCE(ri->map); /* The map pointer is cleared when the map is being torn * down by dev_map_free() */ if (unlikely(!map)) { err = -ENOENT; break; } WRITE_ONCE(ri->map, NULL); err = dev_map_redirect_multi(dev, skb, xdp_prog, map, flags & BPF_F_EXCLUDE_INGRESS); } else { err = dev_map_generic_redirect(fwd, skb, xdp_prog); } if (unlikely(err)) goto err; break; case BPF_MAP_TYPE_XSKMAP: err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); break; case BPF_MAP_TYPE_CPUMAP: err = cpu_map_generic_redirect(fwd, skb); if (unlikely(err)) goto err; break; default: err = -EBADRQC; goto err; } _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; u32 flags = ri->flags; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->flags = 0; ri->map_type = BPF_MAP_TYPE_UNSPEC; if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = xdp_ok_fwd_dev(fwd, skb->len); if (unlikely(err)) goto err; skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); generic_xdp_tx(skb, xdp_prog); return 0; } return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); err: _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); if (unlikely(flags)) return XDP_ABORTED; /* NB! Map type UNSPEC and map_id == INT_MAX (never generated * by map_idr) is used for ifindex based XDP redirect. 
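 *
 * BPF-side sketch (illustrative; 'ifindex' is whatever the program
 * computed):
 *
 *   return bpf_redirect(ifindex, 0);
 *
 * Any non-zero flags value makes the helper return XDP_ABORTED below.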
*/ ri->tgt_index = ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_proto = { .func = bpf_xdp_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; if (ptr != dst_buff) memcpy(dst_buff, ptr, len); return 0; } BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!skb || skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, bpf_skb_copy); } static const struct bpf_func_proto bpf_skb_event_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) const struct bpf_func_proto bpf_skb_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, u32, size, u64, flags) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; int err; if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { err = -EPROTO; goto err_clear; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
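 * Shorter, older layouts are accepted here and routed through the
 * on-stack 'compat' buffer, so the code below can always fill in the
 * full current layout and copy the truncated result back at the end.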
*/ if (ip_tunnel_info_af(info) != AF_INET) goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: goto err_clear; } } to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; if (flags & BPF_F_TUNINFO_FLAGS) to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags); else to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); memcpy(to->local_ipv6, &info->key.u.ipv6.dst, sizeof(to->local_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst); memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy(to_orig, to, size); return 0; err_clear: memset(to_orig, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; if (unlikely(!info || !ip_tunnel_is_options_present(info->key.tun_flags))) { err = -ENOENT; goto err_clear; } if (unlikely(size < info->options_len)) { err = -ENOMEM; goto err_clear; } ip_tunnel_info_opts_get(to, info); if (size > info->options_len) memset(to + info->options_len, 0, size - info->options_len); return info->options_len; err_clear: memset(to, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .func = bpf_skb_get_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, const struct bpf_tunnel_key *, from, u32, size, u64, flags) { struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | BPF_F_NO_TUNNEL_KEY))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, local_ipv6[0]): case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
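 * The shorter layout is zero-extended into the 'compat' buffer below.
 *
 * Typical collect_md writer (illustrative values):
 *
 *   struct bpf_tunnel_key key = {
 *       .tunnel_id   = 42,
 *       .remote_ipv4 = 0xac100164,   - host byte order, converted below
 *       .tunnel_ttl  = 64,
 *   };
 *   bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);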
*/ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; } } if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, flags & BPF_F_DONT_FRAGMENT); __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, !(flags & BPF_F_ZERO_CSUM_TX)); __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, flags & BPF_F_SEQ_NUMBER); __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, !(flags & BPF_F_NO_TUNNEL_KEY)); info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; if (flags & BPF_F_TUNINFO_IPV6) { info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); memcpy(&info->key.u.ipv6.src, from->local_ipv6, sizeof(from->local_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); info->key.flow_flags = FLOWI_FLAG_ANYSRC; } return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, const u8 *, from, u32, size) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); IP_TUNNEL_DECLARE_FLAGS(present) = { }; if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_set_options_present(present); ip_tunnel_info_opts_set(info, from, size, present); return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .func = bpf_skb_set_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { struct metadata_dst __percpu *tmp; tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, METADATA_IP_TUNNEL, GFP_KERNEL); if (!tmp) return NULL; if (cmpxchg(&md_dst, NULL, tmp)) metadata_dst_free_percpu(tmp); } switch (which) { case BPF_FUNC_skb_set_tunnel_key: return &bpf_skb_set_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_opt: return &bpf_skb_set_tunnel_opt_proto; default: return NULL; } } BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = 
ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SOCK_CGROUP_DATA static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .func = bpf_skb_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, int ancestor_level) { struct cgroup *ancestor; struct cgroup *cgrp; sk = sk_to_full_sk(sk); if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) return 0; return cgroup_id(ancestor); } BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) { return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) { return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); } static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif static unsigned long bpf_xdp_copy(void *dst, const void *ctx, unsigned long off, unsigned long len) { struct xdp_buff *xdp = (struct xdp_buff *)ctx; bpf_xdp_copy_buf(xdp, off, dst, len, false); return 0; } BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) const struct bpf_func_proto bpf_xdp_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? 
__sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .func = bpf_get_socket_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .func = bpf_get_socket_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) { return __sock_gen_cookie(ctx); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .func = bpf_get_socket_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) { return sk ? sock_gen_cookie(sk) : 0; } const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { .func = bpf_get_socket_ptr_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static u64 __bpf_get_netns_cookie(struct sock *sk) { const struct net *net = sk ? sock_net(sk) : &init_net; return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) { return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_proto = { .func = bpf_get_netns_cookie, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { .func = bpf_get_netns_cookie_sock, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .func = bpf_get_netns_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { .func = bpf_get_netns_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) { return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL); } static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { .func = bpf_get_netns_cookie_sk_msg, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); kuid_t kuid; if (!sk || !sk_fullsock(sk)) return overflowuid; kuid = sock_net_uid(sock_net(sk), sk); return from_kuid_munged(sock_net(sk)->user_ns, kuid); } static const struct bpf_func_proto bpf_get_socket_uid_proto = { .func = bpf_get_socket_uid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) { u32 sk_bpf_cb_flags; if (getopt) { *(u32 *)optval = sk->sk_bpf_cb_flags; return 0; } sk_bpf_cb_flags = *(u32 *)optval; if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) return -EINVAL; sk->sk_bpf_cb_flags = sk_bpf_cb_flags; return 0; } static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { switch (optname) { case SO_REUSEADDR: case SO_SNDBUF: case SO_RCVBUF: case SO_KEEPALIVE: case SO_PRIORITY: case SO_REUSEPORT: case SO_RCVLOWAT: case SO_MARK: case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; case SO_BINDTODEVICE: break; default: return -EINVAL; } if (optname == SK_BPF_CB_FLAGS) return sk_bpf_set_get_cb_flags(sk, optval, getopt); if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; return sk_getsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return sk_setsockopt(sk, SOL_SOCKET, optname, KERNEL_SOCKPTR(optval), *optlen); } static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, char *optval, int optlen) { if (optlen != sizeof(int)) return -EINVAL; switch (optname) { case TCP_BPF_SOCK_OPS_CB_FLAGS: { int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; memcpy(optval, &cb_flags, optlen); break; } case TCP_BPF_RTO_MIN: { int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); memcpy(optval, &rto_min_us, optlen); break; } case TCP_BPF_DELACK_MAX: { int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); memcpy(optval, &delack_max_us, optlen); break; } default: return -EINVAL; } return 0; } static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); unsigned long timeout; int val; if (optlen != sizeof(int)) return -EINVAL; val = *(int *)optval; /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) return -EINVAL; tp->snd_cwnd_clamp = val; tp->snd_ssthresh = val; break; case TCP_BPF_DELACK_MAX: timeout = usecs_to_jiffies(val); if (timeout > TCP_DELACK_MAX || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; case TCP_BPF_RTO_MIN: timeout = usecs_to_jiffies(val); if (timeout > TCP_RTO_MIN || timeout < TCP_TIMEOUT_MIN) return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; case TCP_BPF_SOCK_OPS_CB_FLAGS: if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) return -EINVAL; tp->bpf_sock_ops_cb_flags = val; break; default: return -EINVAL; } return 0; } static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, int *optlen, bool getopt) { struct tcp_sock *tp; int ret; if (*optlen < 2) return -EINVAL; if (getopt) { if 
(!inet_csk(sk)->icsk_ca_ops) return -EINVAL; /* BPF expects a NULL-terminated tcp-cc string */ optval[--(*optlen)] = '\0'; return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } /* "cdg" is the only cc that allocates a pointer * in the inet_csk_ca area. The bpf-tcp-cc may * overwrite this pointer after switching to cdg. */ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) return -ENOTSUPP; /* This stops the following loop: * * .init => bpf_setsockopt(tcp_cc) => .init => * bpf_setsockopt(tcp_cc) => .init => .... * * The second bpf_setsockopt(tcp_cc) is not allowed, * in order to break the loop when both .init * are the same bpf prog. * * This applies even if the second bpf_setsockopt(tcp_cc) * would not cause a loop: only the first '.init' may * call bpf_setsockopt(TCP_CONGESTION) to pick a * fallback cc (e.g. the peer does not support ECN), * and the second '.init' cannot fall back to * another cc. */ tp = tcp_sk(sk); if (tp->bpf_chg_cc_inprogress) return -EBUSY; tp->bpf_chg_cc_inprogress = 1; ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, KERNEL_SOCKPTR(optval), *optlen); tp->bpf_chg_cc_inprogress = 0; return ret; } static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_protocol != IPPROTO_TCP) return -EINVAL; switch (optname) { case TCP_NODELAY: case TCP_MAXSEG: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPCNT: case TCP_SYNCNT: case TCP_WINDOW_CLAMP: case TCP_THIN_LINEAR_TIMEOUTS: case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; case TCP_CONGESTION: return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); case TCP_SAVED_SYN: if (*optlen < 1) return -EINVAL; break; default: if (getopt) return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } if (getopt) { if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); if (!tp->saved_syn || *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; memcpy(optval, tp->saved_syn->data, *optlen); /* It cannot free tp->saved_syn here, because it * does not know whether user space still needs it.
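 * Unlike the regular user-space TCP_SAVED_SYN getsockopt() path, which
 * may release the saved SYN once it has been copied out, repeated BPF
 * reads therefore keep seeing the same saved data.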
*/ return 0; } return do_tcp_getsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); } return do_tcp_setsockopt(sk, SOL_TCP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ip_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET) return -EINVAL; switch (optname) { case IP_TOS: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return do_ip_getsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return do_ip_setsockopt(sk, SOL_IP, optname, KERNEL_SOCKPTR(optval), *optlen); } static int sol_ipv6_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { if (sk->sk_family != AF_INET6) return -EINVAL; switch (optname) { case IPV6_TCLASS: case IPV6_AUTOFLOWLABEL: if (*optlen != sizeof(int)) return -EINVAL; break; default: return -EINVAL; } if (getopt) return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), KERNEL_SOCKPTR(optlen)); return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, KERNEL_SOCKPTR(optval), *optlen); } static int __bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) return sol_socket_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) return sol_ip_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) return sol_tcp_sockopt(sk, optname, optval, &optlen, false); return -EINVAL; } static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) { return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; } static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_setsockopt(sk, level, optname, optval, optlen); } static int __bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { int err, saved_optlen = optlen; if (!sk_fullsock(sk)) { err = -EINVAL; goto done; } if (level == SOL_SOCKET) err = sol_socket_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) err = sol_ip_sockopt(sk, optname, optval, &optlen, true); else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); else err = -EINVAL; done: if (err) optlen = 0; if (optlen < saved_optlen) memset(optval + optlen, 0, saved_optlen - optlen); return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { if (sk_fullsock(sk)) sock_owned_by_me(sk); return __bpf_getsockopt(sk, level, optname, optval, optlen); } BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_setsockopt_proto = { .func = bpf_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, int, optname, 
char *, optval, int, optlen) { return _bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_sk_getsockopt_proto = { .func = bpf_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_setsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = { .func = bpf_unlocked_sk_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { return __bpf_getsockopt(sk, level, optname, optval, optlen); } const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = { .func = bpf_unlocked_sk_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .func = bpf_sock_addr_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .func = bpf_sock_addr_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .func = bpf_sock_ops_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, int optname, const u8 **start) { struct sk_buff *syn_skb = bpf_sock->syn_skb; const u8 *hdr_start; int ret; if (syn_skb) { /* sk is a request_sock here */ if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } else { /* optname == TCP_BPF_SYN_MAC */ hdr_start = skb_mac_header(syn_skb); ret = skb_mac_header_len(syn_skb) + skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); } } 
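/*
 * Caller-side sketch (an assumed sockops program; the buffer size is
 * illustrative): the received SYN's headers can be pulled with
 *
 *	char hdrs[128];
 *	int err = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_IP,
 *				 hdrs, sizeof(hdrs));
 *
 * err < 0 on failure; on success the network + tcp headers of the
 * SYN are copied into hdrs via this function.
 */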
else { struct sock *sk = bpf_sock->sk; struct saved_syn *saved_syn; if (sk->sk_state == TCP_NEW_SYN_RECV) /* synack retransmit. bpf_sock->syn_skb will * not be available. It has to resort to * saved_syn (if it is saved). */ saved_syn = inet_reqsk(sk)->saved_syn; else saved_syn = tcp_sk(sk)->saved_syn; if (!saved_syn) return -ENOENT; if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; } else if (optname == TCP_BPF_SYN_IP) { hdr_start = saved_syn->data + saved_syn->mac_hdrlen; ret = saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } else { /* optname == TCP_BPF_SYN_MAC */ /* TCP_SAVE_SYN may not have saved the mac hdr */ if (!saved_syn->mac_hdrlen) return -ENOENT; hdr_start = saved_syn->data; ret = saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } *start = hdr_start; return ret; } BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); if (ret > 0) { copy_len = ret; if (optlen < copy_len) { copy_len = optlen; ret = -ENOSPC; } memcpy(optval, start, copy_len); } /* Zero out unused buffer at the end */ memset(optval + copy_len, 0, optlen - copy_len); return ret; } return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .func = bpf_sock_ops_cb_flags_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; EXPORT_SYMBOL_GPL(ipv6_bpf_stub); BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, int, addr_len) { #ifdef CONFIG_INET struct sock *sk = ctx->sk; u32 flags = BIND_FROM_BPF; int err; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ return -EAFNOSUPPORT; } static const 
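/*
 * Sketch of intended use from a cgroup/connect4 program, binding the
 * socket to 10.0.0.1 with an auto-selected port (illustrative only,
 * not the only valid usage):
 *
 *	struct sockaddr_in addr = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = bpf_htonl(0x0a000001),
 *	};
 *	bpf_bind(ctx, (struct sockaddr *)&addr, sizeof(addr));
 *
 * With sin_port == 0, BIND_FORCE_ADDRESS_NO_PORT above defers the
 * port selection until connect().
 */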
struct bpf_func_proto bpf_bind_proto = { .func = bpf_bind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #ifdef CONFIG_XFRM #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) struct metadata_dst __percpu *xfrm_bpf_md_dst; EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); #endif BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { const struct sec_path *sp = skb_sec_path(skb); const struct xfrm_state *x; if (!sp || unlikely(index >= sp->len || flags)) goto err_clear; x = sp->xvec[index]; if (unlikely(size != sizeof(struct bpf_xfrm_state))) goto err_clear; to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; to->ext = 0; if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; err_clear: memset(to, 0, size); return -EINVAL; } static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { .func = bpf_skb_get_xfrm_state, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) { params->h_vlan_TCI = 0; params->h_vlan_proto = 0; if (mtu) params->mtu_result = mtu; /* union with tot_len */ return 0; } #endif #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct fib_nh_common *nhc; struct in_device *in_dev; struct neighbour *neigh; struct net_device *dev; struct fib_result res; struct flowi4 fl4; u32 mtu = 0; int err; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; fl4.flowi4_oif = params->ifindex; } else { fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } fl4.flowi4_tos = params->tos & INET_DSCP_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.flowi4_proto = params->l4_protocol; fl4.daddr = params->ipv4_dst; fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; fl4.flowi4_multipath_hash = 0; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
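/* GNU "?:" with the middle operand omitted: use the l3mdev table id when it is non-zero, else fall back to RT_TABLE_MAIN */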
: RT_TABLE_MAIN; struct fib_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl4.flowi4_mark = params->mark; else fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } if (err) { /* map fib lookup errors to RTN_ type */ if (err == -EINVAL) return BPF_FIB_LKUP_RET_BLACKHOLE; if (err == -EHOSTUNREACH) return BPF_FIB_LKUP_RET_UNREACHABLE; if (err == -EACCES) return BPF_FIB_LKUP_RET_PROHIBIT; return BPF_FIB_LKUP_RET_NOT_FWDED; } if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (fib_info_num_path(res.fi) > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } nhc = res.nhc; /* do not handle lwt encaps right now */ if (nhc->nhc_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nhc->nhc_dev; params->rt_metric = res.fi->fib_priority; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) params->ipv4_src = fib_result_prefsrc(net, &res); /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ if (likely(nhc->nhc_gw_family != AF_INET6)) { if (nhc->nhc_gw_family) params->ipv4_dst = nhc->nhc_gw.ipv4; } else { struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst; params->family = AF_INET6; *dst = nhc->nhc_gw.ipv6; } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; if (likely(nhc->nhc_gw_family != AF_INET6)) neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); else neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; struct fib6_result res = {}; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; struct flowi6 fl6; int strict = 0; int oif, err; u32 mtu = 0; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; oif = fl6.flowi6_oif = params->ifindex; } else { oif = fl6.flowi6_iif = params->ifindex; fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; fl6.flowi6_proto = params->l4_protocol; fl6.daddr = *dst; fl6.saddr = *src; fl6.fl6_sport = params->sport; fl6.fl6_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
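/* same GNU "?:" main-table fallback as in the IPv4 path above */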
: RT_TABLE_MAIN; struct fib6_table *tb; if (flags & BPF_FIB_LOOKUP_TBID) { tbid = params->tbid; /* zero out for vlan output */ params->tbid = 0; } tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res, strict); } else { if (flags & BPF_FIB_LOOKUP_MARK) fl6.flowi6_mark = params->mark; else fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict); } if (unlikely(err || IS_ERR_OR_NULL(res.f6i) || res.f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; switch (res.fib6_type) { /* only unicast is forwarded */ case RTN_UNICAST: break; case RTN_BLACKHOLE: return BPF_FIB_LKUP_RET_BLACKHOLE; case RTN_UNREACHABLE: return BPF_FIB_LKUP_RET_UNREACHABLE; case RTN_PROHIBIT: return BPF_FIB_LKUP_RET_PROHIBIT; default: return BPF_FIB_LKUP_RET_NOT_FWDED; } ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif, fl6.flowi6_oif != 0, NULL, strict); if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src); if (params->tot_len > mtu) { params->mtu_result = mtu; /* union with tot_len */ return BPF_FIB_LKUP_RET_FRAG_NEEDED; } } if (res.nh->fib_nh_lws) return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (res.nh->fib_nh_gw_family) *dst = res.nh->fib_nh_gw6; dev = res.nh->fib_nh_dev; params->rt_metric = res.f6i->fib6_metric; params->ifindex = dev->ifindex; if (flags & BPF_FIB_LOOKUP_SRC) { if (res.f6i->fib6_prefsrc.plen) { *src = res.f6i->fib6_prefsrc.addr; } else { err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev, &fl6.daddr, 0, src); if (err) return BPF_FIB_LKUP_RET_NO_SRC_ADDR; } } if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH) goto set_fwd_params; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. 
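 * (With the RCU flavors consolidated, a BH-disabled section is an RCU
 * read-side critical section, so the noref neigh lookup below is safe
 * as-is.)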
*/ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst); if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID)) return BPF_FIB_LKUP_RET_NO_NEIGH; memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); set_fwd_params: return bpf_fib_set_fwd_params(params, mtu); } #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif } return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .func = bpf_xdp_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; if (flags & ~BPF_FIB_LOOKUP_MASK) return -EINVAL; if (params->tot_len) check_mtu = true; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; /* When tot_len isn't provided by the user, check the skb * against the MTU of the net_device found by the FIB lookup */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; params->mtu_result = dev->mtu; /* union with tot_len */ } return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .func = bpf_skb_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, u32 ifindex) { struct net *netns = dev_net(dev_curr); /* Non-redirect use-cases can use ifindex=0 and skip the ifindex lookup */ if (ifindex == 0) return dev_curr; return dev_get_by_index_rcu(netns, ifindex); } BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; struct net_device *dev = skb->dev; int mtu, dev_len, skb_len; if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) return -EINVAL; if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* If *mtu_len is set, use it as the input L3 length (as iph->tot_len is for fib_lookup) */ skb_len = *mtu_len ?
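/*
 * Program-side sketch (tc context; a zero ifindex means "use skb->dev"
 * and zero flags requests the plain length check):
 *
 *	__u32 mtu = 0;
 *	if (bpf_check_mtu(skb, 0, &mtu, 0, 0) != BPF_MTU_CHK_RET_SUCCESS)
 *		return TC_ACT_SHOT;
 */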
*mtu_len + dev->hard_header_len : skb->len; skb_len += len_diff; /* len_diff may be negative, letting a planned shrink pass the check */ if (skb_len <= dev_len) { ret = BPF_MTU_CHK_RET_SUCCESS; goto out; } /* At this point, skb->len exceeds the MTU, but as it includes the length * of all segments, the individual segments can still be below the MTU. * The SKB can possibly get re-segmented in the transmit path (see * validate_xmit_skb). Thus, the user must choose whether segs are to be * MTU checked. */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; if (flags & BPF_MTU_CHK_SEGS && !skb_gso_validate_network_len(skb, mtu)) ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; } out: *mtu_len = mtu; return ret; } BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) { struct net_device *dev = xdp->rxq->dev; int xdp_len = xdp->data_end - xdp->data; int ret = BPF_MTU_CHK_RET_SUCCESS; int mtu, dev_len; /* XDP variant doesn't support multi-buffer segment check (yet) */ if (unlikely(flags)) return -EINVAL; dev = __dev_via_ifindex(dev, ifindex); if (unlikely(!dev)) return -ENODEV; mtu = READ_ONCE(dev->mtu); dev_len = mtu + dev->hard_header_len; /* If *mtu_len is set, use it as the input L3 length (as iph->tot_len is for fib_lookup) */ if (*mtu_len) xdp_len = *mtu_len + dev->hard_header_len; xdp_len += len_diff; /* len_diff may be negative, letting a planned shrink pass the check */ if (xdp_len > dev_len) ret = BPF_MTU_CHK_RET_FRAG_NEEDED; *mtu_len = mtu; return ret; } static const struct bpf_func_proto bpf_skb_check_mtu_proto = { .func = bpf_skb_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { .func = bpf_xdp_check_mtu, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u32), .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; if (!seg6_validate_srh(srh, len, false)) return -EINVAL; switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: if (skb->protocol != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); break; case BPF_LWT_ENCAP_SEG6: skb_reset_inner_headers(skb); skb->encapsulation = 1; err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); break; default: return -EINVAL; } bpf_compute_data_pointers(skb); if (err) return err; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); } #endif /* CONFIG_IPV6_SEG6_BPF */ #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP: return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) case BPF_LWT_ENCAP_IP:
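/*
 * Reached from a lwt_xmit program, e.g. (a sketch; the pushed header
 * must be a complete, well-formed IPv4 or IPv6 header):
 *
 *	struct iphdr iph = { .version = 4, .ihl = 5, ... };
 *	bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &iph, sizeof(iph));
 */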
return bpf_push_ip_encap(skb, hdr, len, false /* egress */); #endif default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .func = bpf_lwt_in_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; lockdep_assert_held(&srh_state->bh_lock); if (srh == NULL) return -EINVAL; srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .func = bpf_lwt_seg6_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { srh_state->srh = NULL; } else { srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen = srh_state->srh->hdrlen << 3; srh_state->valid = true; } } BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int hdroff = 0; int err; lockdep_assert_held(&srh_state->bh_lock); switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_DT6: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) return -EBADMSG; if (!pskb_pull(skb, hdroff)) return -EBADMSG; skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; bpf_compute_data_pointers(skb); bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = 
bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) bpf_update_srh_state(skb); return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) bpf_update_srh_state(skb); return err; default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .func = bpf_lwt_seg6_action, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; struct ipv6hdr *hdr; int srhoff = 0; int ret; lockdep_assert_held(&srh_state->bh_lock); if (unlikely(srh == NULL)) return -EINVAL; srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (unlikely(ptr < srh_tlvs || ptr > srh_end)) return -EFAULT; if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) return -EFAULT; if (len > 0) { ret = skb_cow_head(skb, len); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, offset, len); } else { ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); } bpf_compute_data_pointers(skb); if (unlikely(ret < 0)) return ret; hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; srh_state->valid = false; return 0; } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .func = bpf_lwt_seg6_adjust_srh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_IPV6_SEG6_BPF */ #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; if (family == AF_INET) { __be32 src4 = tuple->ipv4.saddr; __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) sk = __inet_lookup(net, hinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, net->ipv4.udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) sk = __inet6_lookup(net, hinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); else if (likely(ipv6_bpf_stub)) sk = ipv6_bpf_stub->udp6_lib_lookup(net, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, net->ipv4.udp_table, NULL); #endif } if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); sk = NULL; } return sk; } /* bpf_skc_lookup performs the core lookup for different types of sockets, * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. 
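 * The tuple length selects the address family: sizeof(tuple->ipv4)
 * means AF_INET, sizeof(tuple->ipv6) means AF_INET6, and any other
 * length fails the lookup.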
*/ static struct sock * __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = NULL; struct net *net; u8 family; if (len == sizeof(tuple->ipv4)) family = AF_INET; else if (len == sizeof(tuple->ipv6)) family = AF_INET6; else return NULL; if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; if (sdif < 0) { if (family == AF_INET) sdif = inet_sdif(skb); else sdif = inet6_sdif(skb); } if ((s32)netns_id < 0) { net = caller_net; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); } else { net = get_net_ns_by_id(caller_net, netns_id); if (unlikely(!net)) goto out; sk = sk_lookup(net, tuple, ifindex, sdif, family, proto); put_net(net); } out: return sk; } static struct sock * __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id, u64 flags, int sdif) { struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, sdif); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } static struct sock * bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct net *caller_net; int ifindex; if (skb->dev) { caller_net = dev_net(skb->dev); ifindex = skb->dev->ifindex; } else { caller_net = sock_net(skb->sk); ifindex = 0; } return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags, -1); } static struct sock * bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, u8 proto, u64 netns_id, u64 flags) { struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); if (sk) { struct sock *sk2 = sk_to_full_sk(sk); /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. 
*/ if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) { WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); return NULL; } sk = sk2; } } return sk; } BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .func = bpf_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .func = bpf_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); } static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .func = bpf_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = { .func = bpf_tc_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = { .func = bpf_tc_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { struct net_device *dev = skb->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return 
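/* pointers are returned as unsigned long; the verifier restores the pointer type from ret_type (RET_PTR_TO_SOCKET_OR_NULL here) */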
(unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = { .func = bpf_tc_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_sk_release, struct sock *, sk) { if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_UDP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .func = bpf_xdp_sk_lookup_udp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .func = bpf_xdp_skc_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags) { struct net_device *dev = ctx->rxq->dev; int ifindex = dev->ifindex, sdif = dev_sdif(dev); struct net *caller_net = dev_net(dev); return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex, IPPROTO_TCP, netns_id, flags, sdif); } static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .func = bpf_xdp_sk_lookup_tcp, .gpl_only = false, .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .func = bpf_sock_addr_skc_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct 
bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_TCP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .func = bpf_sock_addr_sk_lookup_tcp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx, struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) { return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, sock_net(ctx->sk), 0, IPPROTO_UDP, netns_id, flags, -1); } static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .func = bpf_sock_addr_sk_lookup_udp, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_tcp_sock, bytes_received): case offsetof(struct bpf_tcp_sock, bytes_acked): return size == sizeof(__u64); default: return size == sizeof(__u32); } } u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_TCP_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct tcp_sock, FIELD)); \ } while (0) #define BPF_INET_SOCK_GET_COMMON(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \ FIELD) > \ sizeof_field(struct bpf_tcp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct inet_connection_sock, \ FIELD), \ si->dst_reg, si->src_reg, \ offsetof( \ struct inet_connection_sock, \ FIELD)); \ } while (0) BTF_TYPE_EMIT(struct bpf_tcp_sock); switch (si->off) { case offsetof(struct bpf_tcp_sock, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct tcp_sock, rtt_min) + offsetof(struct minmax_sample, v)); break; case offsetof(struct bpf_tcp_sock, snd_cwnd): BPF_TCP_SOCK_GET_COMMON(snd_cwnd); break; case offsetof(struct bpf_tcp_sock, srtt_us): BPF_TCP_SOCK_GET_COMMON(srtt_us); break; case offsetof(struct bpf_tcp_sock, snd_ssthresh): BPF_TCP_SOCK_GET_COMMON(snd_ssthresh); break; case offsetof(struct bpf_tcp_sock, rcv_nxt): BPF_TCP_SOCK_GET_COMMON(rcv_nxt); break; case offsetof(struct bpf_tcp_sock, snd_nxt): BPF_TCP_SOCK_GET_COMMON(snd_nxt); break; case offsetof(struct bpf_tcp_sock, snd_una): BPF_TCP_SOCK_GET_COMMON(snd_una); break; case offsetof(struct bpf_tcp_sock, mss_cache): BPF_TCP_SOCK_GET_COMMON(mss_cache); break; case offsetof(struct bpf_tcp_sock, ecn_flags): BPF_TCP_SOCK_GET_COMMON(ecn_flags); break; case offsetof(struct bpf_tcp_sock, rate_delivered): 
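/*
 * For reference, BPF_TCP_SOCK_GET_COMMON(rate_delivered) expands to a
 * single load from the real socket, roughly:
 *
 *	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock,
 *					       rate_delivered),
 *			      si->dst_reg, si->src_reg,
 *			      offsetof(struct tcp_sock, rate_delivered));
 */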
BPF_TCP_SOCK_GET_COMMON(rate_delivered); break; case offsetof(struct bpf_tcp_sock, rate_interval_us): BPF_TCP_SOCK_GET_COMMON(rate_interval_us); break; case offsetof(struct bpf_tcp_sock, packets_out): BPF_TCP_SOCK_GET_COMMON(packets_out); break; case offsetof(struct bpf_tcp_sock, retrans_out): BPF_TCP_SOCK_GET_COMMON(retrans_out); break; case offsetof(struct bpf_tcp_sock, total_retrans): BPF_TCP_SOCK_GET_COMMON(total_retrans); break; case offsetof(struct bpf_tcp_sock, segs_in): BPF_TCP_SOCK_GET_COMMON(segs_in); break; case offsetof(struct bpf_tcp_sock, data_segs_in): BPF_TCP_SOCK_GET_COMMON(data_segs_in); break; case offsetof(struct bpf_tcp_sock, segs_out): BPF_TCP_SOCK_GET_COMMON(segs_out); break; case offsetof(struct bpf_tcp_sock, data_segs_out): BPF_TCP_SOCK_GET_COMMON(data_segs_out); break; case offsetof(struct bpf_tcp_sock, lost_out): BPF_TCP_SOCK_GET_COMMON(lost_out); break; case offsetof(struct bpf_tcp_sock, sacked_out): BPF_TCP_SOCK_GET_COMMON(sacked_out); break; case offsetof(struct bpf_tcp_sock, bytes_received): BPF_TCP_SOCK_GET_COMMON(bytes_received); break; case offsetof(struct bpf_tcp_sock, bytes_acked): BPF_TCP_SOCK_GET_COMMON(bytes_acked); break; case offsetof(struct bpf_tcp_sock, dsack_dups): BPF_TCP_SOCK_GET_COMMON(dsack_dups); break; case offsetof(struct bpf_tcp_sock, delivered): BPF_TCP_SOCK_GET_COMMON(delivered); break; case offsetof(struct bpf_tcp_sock, delivered_ce): BPF_TCP_SOCK_GET_COMMON(delivered_ce); break; case offsetof(struct bpf_tcp_sock, icsk_retransmits): BPF_INET_SOCK_GET_COMMON(icsk_retransmits); break; } return insn - insn_buf; } BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_tcp_sock_proto = { .func = bpf_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; } static const struct bpf_func_proto bpf_get_listener_sock_proto = { .func = bpf_get_listener_sock, .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); break; case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); break; default: return 0; } if (skb_headlen(skb) < iphdr_len) return 0; if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) return 0; return INET_ECN_set_ce(skb); } bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id)) return false; if (off % size != 0) return false; switch (off) { default: return size == sizeof(__u32); } } u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #define BPF_XDP_SOCK_GET(FIELD) \ do { \ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \ sizeof_field(struct bpf_xdp_sock, FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\ si->dst_reg, si->src_reg, \ offsetof(struct xdp_sock, FIELD)); \ } while (0) switch (si->off) { case offsetof(struct 
bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); break; } return insn - insn_buf; } static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { .func = bpf_skb_ecn_set_ce, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES int ret; if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -EINVAL; if (!th->ack || th->rst || th->syn) return -ENOENT; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; if (tcp_synq_no_recent_overflow(sk)) return -ENOENT; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). */ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; ret = __cookie_v4_check((struct iphdr *)iph, th); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; ret = __cookie_v6_check((struct ipv6hdr *)iph, th); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (ret > 0) return 0; return -ENOENT; #else return -ENOTSUPP; #endif } static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .func = bpf_tcp_check_syncookie, .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, struct tcphdr *, th, u32, th_len) { #ifdef CONFIG_SYN_COOKIES u32 cookie; u16 mss; if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) return -EINVAL; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies)) return -ENOENT; if (!th->syn || th->ack || th->fin || th->rst) return -EINVAL; if (unlikely(iph_len < sizeof(struct iphdr))) return -EINVAL; /* Both struct iphdr and struct ipv6hdr have the version field at the * same offset so we can cast to the shorter header (struct iphdr). 
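 * (On the wire, the version is the high nibble of the first byte in
 * both cases.)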
*/ switch (((struct iphdr *)iph)->version) { case 4: if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk)) return -EINVAL; mss = tcp_v4_get_syncookie(sk, iph, th, &cookie); break; #if IS_BUILTIN(CONFIG_IPV6) case 6: if (unlikely(iph_len < sizeof(struct ipv6hdr))) return -EINVAL; if (sk->sk_family != AF_INET6) return -EINVAL; mss = tcp_v6_get_syncookie(sk, iph, th, &cookie); break; #endif /* CONFIG_IPV6 */ default: return -EPROTONOSUPPORT; } if (mss == 0) return -ENOENT; return cookie | ((u64)mss << 32); #else return -EOPNOTSUPP; #endif /* CONFIG_SYN_COOKIES */ } static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .func = bpf_tcp_gen_syncookie, .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags) { if (!sk || flags != 0) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EOPNOTSUPP; if (unlikely(dev_net(skb->dev) != sock_net(sk))) return -ENETUNREACH; if (sk_unhashed(sk)) return -EOPNOTSUPP; if (sk_is_refcounted(sk) && unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) return -ENOENT; skb_orphan(skb); skb->sk = sk; skb->destructor = sock_pfree; return 0; } static const struct bpf_func_proto bpf_sk_assign_proto = { .func = bpf_sk_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_ANYTHING, }; static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, u8 search_kind, const u8 *magic, u8 magic_len, bool *eol) { u8 kind, kind_len; *eol = false; while (op < opend) { kind = op[0]; if (kind == TCPOPT_EOL) { *eol = true; return ERR_PTR(-ENOMSG); } else if (kind == TCPOPT_NOP) { op++; continue; } if (opend - op < 2 || opend - op < op[1] || op[1] < 2) /* Something is wrong in the received header. * Follow the TCP stack's tcp_parse_options() * and just bail out here. */ return ERR_PTR(-EFAULT); kind_len = op[1]; if (search_kind == kind) { if (!magic_len) return op; if (magic_len > kind_len - 2) return ERR_PTR(-ENOMSG); if (!memcmp(&op[2], magic, magic_len)) return op; } op += kind_len; } return ERR_PTR(-ENOMSG); } BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, void *, search_res, u32, len, u64, flags) { bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; const u8 *op, *opend, *magic, *search = search_res; u8 search_kind, search_len, copy_len, magic_len; int ret; if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; /* Two bytes is the minimal option len, except for TCPOPT_NOP and * TCPOPT_EOL, which are useless for the bpf prog to learn; * this helper therefore also disallows loading them. */ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) return -EINVAL; search_kind = search[0]; search_len = search[1]; if (search_len > len || search_kind == TCPOPT_NOP || search_kind == TCPOPT_EOL) return -EINVAL; if (search_kind == TCPOPT_EXP || search_kind == 253) { /* 16 or 32 bit magic.
+2 for kind and kind length */ if (search_len != 4 && search_len != 6) return -EINVAL; magic = &search[2]; magic_len = search_len - 2; } else { if (search_len) return -EINVAL; magic = NULL; magic_len = 0; } if (load_syn) { ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); if (ret < 0) return ret; opend = op + ret; op += sizeof(struct tcphdr); } else { if (!bpf_sock->skb || bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) /* This bpf_sock->op cannot call this helper */ return -EPERM; opend = bpf_sock->skb_data_end; op = bpf_sock->skb->data + sizeof(struct tcphdr); } op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, &eol); if (IS_ERR(op)) return PTR_ERR(op); copy_len = op[1]; ret = copy_len; if (copy_len > len) { ret = -ENOSPC; copy_len = len; } memcpy(search_res, op, copy_len); return ret; } static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .func = bpf_sock_ops_load_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, const void *, from, u32, len, u64, flags) { u8 new_kind, new_kind_len, magic_len = 0, *opend; const u8 *op, *new_op, *magic = NULL; struct sk_buff *skb; bool eol; if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) return -EPERM; if (len < 2 || flags) return -EINVAL; new_op = from; new_kind = new_op[0]; new_kind_len = new_op[1]; if (new_kind_len > len || new_kind == TCPOPT_NOP || new_kind == TCPOPT_EOL) return -EINVAL; if (new_kind_len > bpf_sock->remaining_opt_len) return -ENOSPC; /* 253 is another experimental kind */ if (new_kind == TCPOPT_EXP || new_kind == 253) { if (new_kind_len < 4) return -EINVAL; /* Match on the 2-byte magic as well. * RFC 6994: the magic can be 2 or 4 bytes. * Hence, matching only 2 bytes is on the * conservative side, but it is the right * thing to do for the 'search-for-duplication' * purpose. */ magic = &new_op[2]; magic_len = 2; } /* Check for duplication */ skb = bpf_sock->skb; op = skb->data + sizeof(struct tcphdr); opend = bpf_sock->skb_data_end; op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, &eol); if (!IS_ERR(op)) return -EEXIST; if (PTR_ERR(op) != -ENOMSG) return PTR_ERR(op); if (eol) /* The option list has been ended by TCPOPT_EOL. Treat it as * if no more header options can be written. */ return -ENOSPC; /* No duplication found. Store the header option.
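 * A writer-side sketch (a sockops program running the
 * BPF_SOCK_OPS_WRITE_HDR_OPT_CB callback; the option bytes are
 * illustrative):
 *
 *	__u8 opt[4] = { 254, 4, 0xeb, 0x9f };	kind, len, 2-byte magic
 *	bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);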
*/ memcpy(opend, from, new_kind_len); bpf_sock->remaining_opt_len -= new_kind_len; bpf_sock->skb_data_end += new_kind_len; return 0; } static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .func = bpf_sock_ops_store_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u32, len, u64, flags) { if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) return -EPERM; if (flags || len < 2) return -EINVAL; if (len > bpf_sock->remaining_opt_len) return -ENOSPC; bpf_sock->remaining_opt_len -= len; return 0; } static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .func = bpf_sock_ops_reserve_hdr_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; switch (tstamp_type) { case BPF_SKB_CLOCK_REALTIME: skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_REALTIME; break; case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_CLOCK_TAI: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SYN_COOKIES BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th, u32, th_len) { u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; cookie = __cookie_v4_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th, u32, th_len) { #if IS_BUILTIN(CONFIG_IPV6) const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); u32 cookie; u16 mss; if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; cookie = __cookie_v6_init_sequence(iph, th, &mss); return cookie | ((u64)mss << 32); #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .func = bpf_tcp_raw_gen_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, struct tcphdr *, th) { if (__cookie_v4_check(iph, th) > 0) return 0; 
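/* An XDP synproxy-style caller would typically drop on this error, e.g. (sketch): if (bpf_tcp_raw_check_syncookie_ipv4(iph, th)) return XDP_DROP; */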
return -EACCES; } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .func = bpf_tcp_raw_check_syncookie_ipv4, .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct iphdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, struct tcphdr *, th) { #if IS_BUILTIN(CONFIG_IPV6) if (__cookie_v6_check(iph, th) > 0) return 0; return -EACCES; #else return -EPROTONOSUPPORT; #endif } static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .func = bpf_tcp_raw_check_syncookie_ipv6, .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg1_size = sizeof(struct ipv6hdr), .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_clone_redirect: case BPF_FUNC_l3_csum_replace: case BPF_FUNC_l4_csum_replace: case BPF_FUNC_lwt_push_encap: case BPF_FUNC_lwt_seg6_action: case BPF_FUNC_lwt_seg6_adjust_srh: case BPF_FUNC_lwt_seg6_store_bytes: case BPF_FUNC_msg_pop_data: case BPF_FUNC_msg_pull_data: case BPF_FUNC_msg_push_data: case BPF_FUNC_skb_adjust_room: case BPF_FUNC_skb_change_head: case BPF_FUNC_skb_change_proto: case BPF_FUNC_skb_change_tail: case BPF_FUNC_skb_pull_data: case BPF_FUNC_skb_store_bytes: case BPF_FUNC_skb_vlan_pop: case BPF_FUNC_skb_vlan_push: case BPF_FUNC_store_hdr_opt: case BPF_FUNC_xdp_adjust_head: case BPF_FUNC_xdp_adjust_meta: case BPF_FUNC_xdp_adjust_tail: /* tail-called program could call any of the above */ case BPF_FUNC_tail_call: return true; default: return false; } } const struct bpf_func_proto bpf_event_output_data_proto __weak; const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; func_proto = cgroup_current_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_bind_proto; default: return NULL; } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; case BPF_FUNC_perf_event_output: return 
&bpf_event_output_data_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sock_addr_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_sock_addr_skc_lookup_tcp_proto; #endif /* CONFIG_INET */ case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_setsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_setsockopt_proto; default: return NULL; } case BPF_FUNC_getsockopt: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: return &bpf_sock_addr_getsockopt_proto; default: return NULL; } default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_storage_get_proto __weak; const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; case BPF_FUNC_sk_cgroup_id: return &bpf_sk_cgroup_id_proto; case BPF_FUNC_sk_ancestor_cgroup_id: return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: 
return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); } } static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_skb_vlan_push: return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; case BPF_FUNC_skb_adjust_room: return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; case BPF_FUNC_redirect_peer: return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_set_hash: return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_skb_check_mtu_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_skb_cgroup_classid: return &bpf_skb_cgroup_classid_proto; #endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif 
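/* The socket lookup and TCP helpers below additionally need CONFIG_INET. */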
#ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_tc_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_tc_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: return &bpf_get_listener_sock_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_tc_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_skb_ecn_set_ce: return &bpf_skb_ecn_set_ce_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; case BPF_FUNC_skb_set_tstamp: return &bpf_skb_set_tstamp_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_proto; case BPF_FUNC_xdp_load_bytes: return &bpf_xdp_load_bytes_proto; case BPF_FUNC_xdp_store_bytes: return &bpf_xdp_store_bytes_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; case BPF_FUNC_check_mtu: return &bpf_xdp_check_mtu_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_udp: return &bpf_xdp_sk_lookup_udp_proto; case BPF_FUNC_sk_lookup_tcp: return &bpf_xdp_sk_lookup_tcp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_xdp_skc_lookup_tcp_proto; case BPF_FUNC_tcp_check_syncookie: return &bpf_tcp_check_syncookie_proto; case BPF_FUNC_tcp_gen_syncookie: return &bpf_tcp_gen_syncookie_proto; #ifdef CONFIG_SYN_COOKIES case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: return &bpf_tcp_raw_gen_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: return &bpf_tcp_raw_gen_syncookie_ipv6_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv4: return &bpf_tcp_raw_check_syncookie_ipv4_proto; case BPF_FUNC_tcp_raw_check_syncookie_ipv6: return &bpf_tcp_raw_check_syncookie_ipv6_proto; #endif #endif default: return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. 
So we add this dummy type reference which will * be included in vmlinux BTF, allowing both modules to refer to the * same type ID. */ BTF_TYPE_EMIT(struct nf_conn___init); #endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; func_proto = cgroup_common_func_proto(func_id, prog); if (func_proto) return func_proto; switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; case BPF_FUNC_store_hdr_opt: return &bpf_sock_ops_store_hdr_opt_proto; case BPF_FUNC_reserve_hdr_opt: return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUP_NET_CLASSID case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_skb_adjust_room: return &sk_skb_adjust_room_proto; case BPF_FUNC_get_socket_cookie: 
return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; case BPF_FUNC_sk_lookup_udp: return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; case BPF_FUNC_skc_lookup_tcp: return &bpf_skc_lookup_tcp_proto; #endif default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_csum_level: return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_lwt_push_encap: return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return 
lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): if (size != size_default) return false; break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; case bpf_ctx_range(struct __sk_buff, hwtstamp): if (type == BPF_WRITE || size != sizeof(__u64)) return false; break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; break; case offsetof(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; case offsetof(struct __sk_buff, tstamp_type): return false; case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool cg_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, wire_len): return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: return 
false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; full_access: return true; } bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range_till(struct bpf_sock, type, priority): return false; default: return bpf_sock_is_valid_access(off, size, type, info); } } bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct bpf_sock, state): case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case bpf_ctx_range(struct bpf_sock, dst_port): field_size = size == size_default ? 
size_default : sizeof_field(struct bpf_sock, dst_port); bpf_ctx_record_field_size(info, field_size); return bpf_ctx_narrow_access_ok(off, size, field_size); case offsetofend(struct bpf_sock, dst_port) ... offsetof(struct bpf_sock, dst_ip4) - 1: return false; } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (!bpf_sock_is_valid_access(off, size, type, info)) return false; return __sock_filter_check_attach_type(off, type, prog->expected_attach_type); } static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { /* Neither direct read nor direct write requires any preliminary * action. */ return 0; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed here that CTX is in R6. 
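*/
/*
 * For orientation, an illustrative note (not from this file): a classic
 * BPF "ldh [12]" (fetch the EtherType halfword) arrives here as LD_ABS
 * with imm == 12 and is rewritten into, roughly:
 *
 *	R2 = 12;			// offset, from orig->imm
 *	R1 = R6;			// the skb context
 *	R0 = bpf_skb_load_helper_16_no_cache(R1, R2);
 *	if (R0 < 0) {			// the JSGE guard below
 *		R0 = 0;
 *		return R0;		// terminate the program
 *	}
 *
 * so a failed load exits the filter with a verdict of 0.
 */
/*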
*/ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, queue_mapping): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog * has used __sk_buff->tstamp_type or not. * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. 
*/ ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); } DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (prog->expected_attach_type != BPF_XDP_DEVMAP) { switch (off) { case offsetof(struct xdp_md, egress_ifindex): return false; } } if (type == BPF_WRITE) { if (bpf_prog_is_offloaded(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } else { switch (off) { case offsetof(struct xdp_md, data_meta): case offsetof(struct xdp_md, data): case offsetof(struct xdp_md, data_end): if (info->is_ldsx) return false; } } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size) { int ret = -EACCES; mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) ret = nfct_btf_struct_access(log, reg, off, size); mutex_unlock(&nf_conn_btf_access_lock); return ret; } static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to fields not belonging to the attach type's address * family. 
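*/
/*
 * For example (an illustrative program, not part of this file), a v4
 * attach type may only touch the v4 address fields; the verifier would
 * reject the commented-out v6 access at load time:
 *
 *	SEC("cgroup/connect4")
 *	int connect4(struct bpf_sock_addr *ctx)
 *	{
 *		__u32 dip = ctx->user_ip4;	// allowed for *4 attach types
 *		// __u32 x = ctx->user_ip6[0];	// wrong family: rejected
 *		return 1;
 *	}
 */
/*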
*/ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, user_ip6)) return true; if (bpf_ctx_wide_access_ok(off, size, struct bpf_sock_addr, msg_src_ip6)) return true; if (size != size_default) return false; } break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; default: if (type == BPF_READ) { if (size != size_default) return false; } else { return false; } } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. 
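*/
/*
 * Concretely (an illustrative program, not part of this file): only
 * "reply" and "sk_txhash" are writable, and the 64-bit counters must be
 * read at full width:
 *
 *	SEC("sockops")
 *	int ops(struct bpf_sock_ops *skops)
 *	{
 *		__u64 acked = skops->bytes_acked;	// 8-byte read required
 *		if (skops->op == BPF_SOCK_OPS_RTT_CB)
 *			skops->reply = 1;		// 4-byte write, allowed
 *		return 1;
 *	}
 */
/*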
*/ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; case offsetof(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; case offsetof(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; case offsetof(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); case offsetof(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; if (off % size != 0) return false; switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct sk_msg_md, remote_port): case bpf_ctx_range(struct sk_msg_md, local_port): case bpf_ctx_range(struct sk_msg_md, size): if (size != sizeof(__u32)) return false; break; default: return false; } return true; } static bool flow_dissector_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; if 
(type == BPF_WRITE) return false; switch (off) { case bpf_ctx_range(struct __sk_buff, data): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET; return true; case bpf_ctx_range(struct __sk_buff, data_end): if (info->is_ldsx || size != size_default) return false; info->reg_type = PTR_TO_PACKET_END; return true; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_FLOW_KEYS; return true; default: return false; } } static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data)); break; case offsetof(struct __sk_buff, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, data_end)); break; case offsetof(struct __sk_buff, flow_keys): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), si->dst_reg, si->src_reg, offsetof(struct bpf_flow_dissector, flow_keys)); break; } return insn - insn_buf; } static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); #else BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); #endif return insn; } static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), dst_reg, skb_reg, offsetof(struct sk_buff, head)); *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif return insn; } static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. */ if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* check if ingress mask bits is set */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); *insn++ = BPF_JMP_A(4); *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. 
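*/
/*
 * The user-visible effect (illustrative, not from this file): at tc
 * ingress a mono/tai delivery time reads back as 0 unless the program
 * opts in by touching tstamp_type first, which flips
 * prog->tstamp_type_access during verification:
 *
 *	SEC("tc")
 *	int ing(struct __sk_buff *skb)
 *	{
 *		__u8 tt = skb->tstamp_type;	// opt in to raw tstamp reads
 *		__u64 ts = skb->tstamp;		// now returned as-is
 *		return TC_ACT_OK;
 *	}
 */
/*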
*/ *insn++ = BPF_MOV64_IMM(value_reg, 0); *insn++ = BPF_JMP_A(1); } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, offsetof(struct sk_buff, tstamp)); return insn; } static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * skb->tstamp_type bit also. */ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); /* Writing __sk_buff->tstamp as ingress, goto <clear> */ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto <store> */ *insn++ = BPF_JMP_A(2); /* <clear>: skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif /* <store>: skb->tstamp = tstamp */ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); return insn; } #define BPF_EMIT_STORE(size, si, off) \ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ (si)->dst_reg, (si)->src_reg, (off), (si)->imm) static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, len, 4, target_size)); break; case offsetof(struct __sk_buff, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, protocol, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_proto): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_proto, 2, target_size)); break; case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, priority, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, skb_iif, 4, target_size)); break; case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; case offsetof(struct __sk_buff, hash): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, hash, 4, target_size)); break; case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, bpf_target_off(struct sk_buff, mark, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); break; case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, PKT_TYPE_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); 
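/* On big-endian bitfield layouts pkt_type sits in the high bits of the
 * byte, so shift it down into place below.
 */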
#ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); #endif break; case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ break; } if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); } break; case offsetof(struct __sk_buff, vlan_present): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_all, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_meta): off = si->off; off -= offsetof(struct __sk_buff, data_meta); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_meta); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_H, si, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, napi_id, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else *target_size = 4; 
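/* No busy polling compiled in, so there is no napi_id; the read is
 * fixed up to yield 0.
 */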
*insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_family, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_daddr, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, 4, target_size)); break; case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, local_ip6[0]) ... offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_dport, 2, target_size)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct __sk_buff, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; case offsetof(struct __sk_buff, tstamp): BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) insn = bpf_convert_tstamp_write(prog, si, insn); else insn = bpf_convert_tstamp_read(prog, si, insn); break; case offsetof(struct __sk_buff, tstamp_type): insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), 
si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_segs, 2, target_size)); break; case offsetof(struct __sk_buff, gso_size): insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, gso_size, 2, target_size)); break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); off = si->off; off -= offsetof(struct __sk_buff, wire_len); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; case offsetof(struct __sk_buff, hwtstamp): BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, hwtstamps, 8, target_size)); break; } return insn - insn_buf; } u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_bound_dev_if)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, mark): BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); break; case offsetof(struct bpf_sock, priority): BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); break; case offsetof(struct bpf_sock, family): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_family), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_family, sizeof_field(struct sock_common, skc_family), target_size)); break; case offsetof(struct bpf_sock, type): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_type), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_type, sizeof_field(struct sock, sk_type), target_size)); break; case offsetof(struct bpf_sock, protocol): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_protocol), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_protocol, sizeof_field(struct sock, sk_protocol), target_size)); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, sizeof_field(struct sock_common, skc_rcv_saddr), target_size)); break; case offsetof(struct bpf_sock, dst_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_daddr, sizeof_field(struct sock_common, skc_daddr), 
target_size)); break; case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, src_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else (void)off; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, dst_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_v6_daddr.s6_addr32[0], sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]), target_size) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); *target_size = 4; #endif break; case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, sizeof_field(struct sock_common, skc_num), target_size)); break; case offsetof(struct bpf_sock, dst_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_dport), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_dport, sizeof_field(struct sock_common, skc_dport), target_size)); break; case offsetof(struct bpf_sock, state): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_state), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_state, sizeof_field(struct sock_common, skc_state), target_size)); break; case offsetof(struct bpf_sock, rx_queue_mapping): #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), si->dst_reg, si->src_reg, bpf_target_off(struct sock, sk_rx_queue_mapping, sizeof_field(struct sock, sk_rx_queue_mapping), target_size)); *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); #else *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); *target_size = 2; #endif break; } return insn - insn_buf; } static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_meta): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_meta)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct 
xdp_md, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, rxq));
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_rxq_info, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	case offsetof(struct xdp_md, rx_queue_index):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, rxq));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_rxq_info, queue_index));
		break;
	case offsetof(struct xdp_md, egress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, txq));
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_txq_info, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	}

	return insn - insn_buf;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD() loads the nested field S.F.NF, where S is
 * the type of the context structure, F is the field of S that holds a
 * pointer to a nested structure of type NS, and NF is the field of NS to
 * load.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It is up to the caller
 * to make sure that SIZE is not greater than the actual size of S.F.NF.
 *
 * If offset OFF is provided, the load happens from that offset relative to
 * the offset of NF.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)	\
	do {								\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
				      si->src_reg, offsetof(S, F));	\
		*insn++ = BPF_LDX_MEM(					\
			SIZE, si->dst_reg, si->dst_reg,			\
			bpf_target_off(NS, NF, sizeof_field(NS, NF),	\
				       target_size)			\
				+ OFF);					\
	} while (0)

#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)			\
	SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,		\
					     BPF_FIELD_SIZEOF(NS, NF), 0)

/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for a store operation.
 *
 * In addition it uses the temporary field TF (a member of struct S) as a
 * 3rd "register", since the two registers available in convert_ctx_access
 * are not enough: we can override neither SRC, since it holds the value to
 * store, nor DST, since it holds the context pointer that later
 * instructions may still use. But we need a temporary place to save the
 * pointer to the nested structure whose field we want to store to.
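 *
 * Sketch of the sequence the macro emits (pseudo-BPF; TMP stands for the
 * scratch register picked at expansion time, so this is illustrative, not
 * literal output):
 *
 *   *(u64 *)(DST + offsetof(S, TF)) = TMP          // spill scratch reg
 *   TMP = *(DST + offsetof(S, F))                  // pointer to NS
 *   *(SIZE *)(TMP + offsetof(NS, NF) + OFF) = SRC  // the actual store
 *   TMP = *(u64 *)(DST + offsetof(S, TF))          // restore scratch reg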
*/ #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, sizeof_field(NS, NF), \ target_size) \ + OFF, \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ TF) \ do { \ if (type == BPF_WRITE) { \ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ OFF, TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ } \ } while (0) static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family); break; case offsetof(struct bpf_sock_addr, user_ip4): SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, sin_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, user_ip6[0]); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, user_port): /* To get port we need to know sa_family first and then treat * sockaddr as either sockaddr_in or sockaddr_in6. * Though we can simplify since port field has same offset and * size in both structures. * Here we check this invariant and use just one of the * structures if it's true. */ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); /* Account for sin6_port being smaller than user_port. */ port_size = min(port_size, BPF_LDST_BYTES(si)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_family); break; case offsetof(struct bpf_sock_addr, type): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_type); break; case offsetof(struct bpf_sock_addr, protocol): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_protocol); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): /* Treat t_ctx as struct in_addr for msg_src_ip4. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in_addr, t_ctx, s_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); /* Treat t_ctx as struct in6_addr for msg_src_ip6. 
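 * For example (sketch): a 4-byte read of ctx->msg_src_ip6[1] computes
 * off = 4 and is rewritten into a load from t_ctx's s6_addr32[0] plus
 * that offset, i.e. s6_addr32[1].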
*/ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_addr_kern, sk)); break; } return insn - insn_buf; } static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_SK() \ do { \ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ fullsock_reg = reg; \ jmp += 2; \ } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ if (si->dst_reg == si->src_reg) { \ *insn++ = BPF_JMP_A(1); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } \ } while (0) #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. 
We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. */ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ BPF_MEM | BPF_CLASS(si->code), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD), \ si->imm); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) switch (si->off) { case offsetof(struct bpf_sock_ops, op): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, op), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, op)); break; case offsetof(struct bpf_sock_ops, replylong[0]) ... offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != sizeof_field(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != sizeof_field(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, replylong[0]); off += offsetof(struct bpf_sock_ops_kern, replylong[0]); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_W, si, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... 
offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + sizeof_field(struct minmax_sample, t)); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); break; case offsetof(struct bpf_sock_ops, 
rcv_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); break; case offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); break; case offsetof(struct bpf_sock_ops, rate_interval_us): SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); break; case offsetof(struct bpf_sock_ops, sacked_out): SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); break; case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; case offsetof(struct bpf_sock_ops, skb_data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb_data_end), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb_data_end)); break; case offsetof(struct bpf_sock_ops, skb_data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct bpf_sock_ops, skb_len): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), si->dst_reg, si->dst_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct bpf_sock_ops, skb_tcp_flags): off = offsetof(struct sk_buff, cb); off += offsetof(struct tcp_skb_cb, tcp_flags); *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, tcp_flags), si->dst_reg, si->dst_reg, off); break; case offsetof(struct bpf_sock_ops, skb_hwtstamp): { struct bpf_insn *jmp_on_null_skb; *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, skb), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, skb)); /* Reserve one insn to test skb == NULL */ jmp_on_null_skb = insn++; insn = bpf_convert_shinfo_access(si->dst_reg, 
si->dst_reg, insn);
		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct skb_shared_info,
						     hwtstamps, 8,
						     target_size));
		*jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
					       insn - jmp_on_null_skb - 1);
		break;
	}
	}

	return insn - insn_buf;
}

/* data_end = skb->data + skb_headlen() */
static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
						    struct bpf_insn *insn)
{
	int reg;
	int temp_reg_off = offsetof(struct sk_buff, cb) +
			   offsetof(struct sk_skb_cb, temp_reg);

	if (si->src_reg == si->dst_reg) {
		/* We need an extra register, choose and save a register. */
		reg = BPF_REG_9;
		if (si->src_reg == reg || si->dst_reg == reg)
			reg--;
		if (si->src_reg == reg || si->dst_reg == reg)
			reg--;
		*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
	} else {
		reg = si->dst_reg;
	}

	/* reg = skb->data */
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
			      reg, si->src_reg,
			      offsetof(struct sk_buff, data));
	/* AX = skb->len */
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
			      BPF_REG_AX, si->src_reg,
			      offsetof(struct sk_buff, len));
	/* reg = skb->data + skb->len */
	*insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
	/* AX = skb->data_len */
	*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
			      BPF_REG_AX, si->src_reg,
			      offsetof(struct sk_buff, data_len));
	/* reg = skb->data + skb->len - skb->data_len */
	*insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);

	if (si->src_reg == si->dst_reg) {
		/* Restore the saved register */
		*insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
		*insn++ = BPF_MOV64_REG(si->dst_reg, reg);
		*insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
	}

	return insn;
}

static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct __sk_buff, data_end):
		insn = bpf_convert_data_end_access(si, insn);
		break;
	case offsetof(struct __sk_buff, cb[0]) ...
offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct sk_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct sk_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #if IS_ENABLED(CONFIG_IPV6) int off; #endif /* convert ctx uses the fact sg element is first in struct */ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0); switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data)); break; case offsetof(struct sk_msg_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end), si->dst_reg, si->src_reg, offsetof(struct sk_msg, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct sk_msg_md, remote_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct sk_msg_md, local_ip4): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, local_ip6[0]) ... 
offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(sizeof_field(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, remote_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct sk_msg_md, local_port): BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct sk_msg_md, size): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size), si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; case offsetof(struct sk_msg_md, sk): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg, sk)); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = cg_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_in_verifier_ops = { .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_in_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_out_verifier_ops = { .get_func_proto = lwt_out_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = 
lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, }; const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { .get_func_proto = lwt_seg6local_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_seg6local_prog_ops = { }; const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { }; const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { .get_func_proto = sock_addr_func_proto, .is_valid_access = sock_addr_is_valid_access, .convert_ctx_access = sock_addr_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_addr_prog_ops = { }; const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; const struct bpf_prog_ops sock_ops_prog_ops = { }; const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; const struct bpf_prog_ops sk_skb_prog_ops = { }; const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { }; const struct bpf_verifier_ops flow_dissector_verifier_ops = { .get_func_proto = flow_dissector_func_proto, .is_valid_access = flow_dissector_is_valid_access, .convert_ctx_access = flow_dissector_convert_ctx_access, }; const struct bpf_prog_ops flow_dissector_prog_ops = { .test_run = bpf_prog_test_run_flow_dissector, }; int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) goto out; /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. eBPF programs that * have no original program cannot be dumped through this. */ ret = -EACCES; fprog = filter->prog->orig_prog; if (!fprog) goto out; ret = fprog->len; if (!len) /* User space only enquires number of filter blocks. */ goto out; ret = -EINVAL; if (len < fprog->len) goto out; ret = -EFAULT; if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number * of filter blocks. 
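 *
 * Illustrative user-space flow for SO_GET_FILTER (a sketch of the calling
 * convention this implements; error handling omitted):
 *
 *   socklen_t cnt = 0;
 *   getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &cnt);  // cnt = #blocks
 *   struct sock_filter *insns = calloc(cnt, sizeof(*insns));
 *   getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &cnt); // cnt in blocks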
*/ ret = fprog->len; out: sockopt_release_sock(sk); return ret; } #ifdef CONFIG_INET static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; reuse_kern->migrating_sk = migrating_sk; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; reuse_kern->bind_inany = reuse->bind_inany; } struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; else return ERR_PTR(-ECONNREFUSED); } BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. * * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ err = is_sockarray ? -ENOENT : -EINVAL; goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) { err = -EPROTOTYPE; } else if (sk->sk_family != selected_sk->sk_family) { err = -EAFNOSUPPORT; } else { /* Catch all. Likely bound to a different sockaddr. */ err = -EBADFD; } goto error; } reuse_kern->selected_sk = selected_sk; return 0; error: /* Lookup in sock_map can return TCP ESTABLISHED sockets. 
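 * Such a socket is returned by the map lookup with a reference held
 * (see sk_is_refcounted()), so the reference must be dropped on the
 * error path; reuseport listeners are RCU-freed and take no reference.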
*/ if (sk_is_refcounted(selected_sk)) sock_put(selected_sk); return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { .func = sk_select_reuseport, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(sk_reuseport_load_bytes, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len) { return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); } static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { .func = sk_reuseport_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(sk_reuseport_load_bytes_relative, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len, u32, start_header) { return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, len, start_header); } static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { .func = sk_reuseport_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_reuseport_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sk_select_reuseport: return &sk_select_reuseport_proto; case BPF_FUNC_skb_load_bytes: return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } } static bool sk_reuseport_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const u32 size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct sk_reuseport_md) || off % size || type != BPF_READ) return false; switch (off) { case offsetof(struct sk_reuseport_md, data): info->reg_type = PTR_TO_PACKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, data_end): info->reg_type = PTR_TO_PACKET_END; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, hash): return size == size_default; case offsetof(struct sk_reuseport_md, sk): info->reg_type = PTR_TO_SOCKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, migrating_sk): info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; return size == sizeof(__u64); /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < sizeof_field(struct sk_buff, protocol)) return false; fallthrough; case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): case bpf_ctx_range(struct sk_reuseport_md, bind_inany): case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); default: return false; } } #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ sizeof_field(struct sk_reuseport_kern, F), \ target_size)); \ }) #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct 
sk_reuseport_kern, \ struct sk_buff, \ skb, \ SKB_FIELD) #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sock, \ sk, \ SK_FIELD) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct sk_reuseport_md, data): SK_REUSEPORT_LOAD_SKB_FIELD(data); break; case offsetof(struct sk_reuseport_md, len): SK_REUSEPORT_LOAD_SKB_FIELD(len); break; case offsetof(struct sk_reuseport_md, eth_protocol): SK_REUSEPORT_LOAD_SKB_FIELD(protocol); break; case offsetof(struct sk_reuseport_md, ip_protocol): SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); break; case offsetof(struct sk_reuseport_md, data_end): SK_REUSEPORT_LOAD_FIELD(data_end); break; case offsetof(struct sk_reuseport_md, hash): SK_REUSEPORT_LOAD_FIELD(hash); break; case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; case offsetof(struct sk_reuseport_md, sk): SK_REUSEPORT_LOAD_FIELD(sk); break; case offsetof(struct sk_reuseport_md, migrating_sk): SK_REUSEPORT_LOAD_FIELD(migrating_sk); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_reuseport_verifier_ops = { .get_func_proto = sk_reuseport_func_proto, .is_valid_access = sk_reuseport_is_valid_access, .convert_ctx_access = sk_reuseport_convert_ctx_access, }; const struct bpf_prog_ops sk_reuseport_prog_ops = { }; DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); EXPORT_SYMBOL(bpf_sk_lookup_enabled); BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, struct sock *, sk, u64, flags) { if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | BPF_SK_LOOKUP_F_NO_REUSEPORT))) return -EINVAL; if (unlikely(sk && sk_is_refcounted(sk))) return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ /* Check if socket is suitable for packet L3/L4 protocol */ if (sk && sk->sk_protocol != ctx->protocol) return -EPROTOTYPE; if (sk && sk->sk_family != ctx->family && (sk->sk_family == AF_INET || ipv6_only_sock(sk))) return -EAFNOSUPPORT; if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) return -EEXIST; /* Select socket as lookup result */ ctx->selected_sk = sk; ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; return 0; } static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { .func = bpf_sk_lookup_assign, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, .arg3_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_assign: return &bpf_sk_lookup_assign_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: return bpf_sk_base_func_proto(func_id, prog); } } static bool sk_lookup_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) return false; if (off % size != 0) return false; if (type != BPF_READ) return false; switch (off) { case offsetof(struct 
bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); case bpf_ctx_range(struct bpf_sk_lookup, family): case bpf_ctx_range(struct bpf_sk_lookup, protocol): case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): case bpf_ctx_range(struct bpf_sk_lookup, local_port): case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); case bpf_ctx_range(struct bpf_sk_lookup, remote_port): /* Allow 4-byte access to 2-byte field for backward compatibility */ if (size == sizeof(__u32)) return true; bpf_ctx_record_field_size(info, sizeof(__be16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); case offsetofend(struct bpf_sk_lookup, remote_port) ... offsetof(struct bpf_sk_lookup, local_ip4) - 1: /* Allow access to zero padding for backward compatibility */ bpf_ctx_record_field_size(info, sizeof(__u16)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16)); default: return false; } } static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_sk_lookup, sk): *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, selected_sk)); break; case offsetof(struct bpf_sk_lookup, family): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, family, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, protocol, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, remote_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.saddr, 4, target_size)); break; case offsetof(struct bpf_sk_lookup, local_ip4): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, v4.daddr, 4, target_size)); break; case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.saddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): { #if IS_ENABLED(CONFIG_IPV6) int off = si->off; off -= offsetof(struct bpf_sk_lookup, local_ip6[0]); off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, offsetof(struct bpf_sk_lookup_kern, v6.daddr)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; } case offsetof(struct bpf_sk_lookup, remote_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct 
bpf_sk_lookup_kern, sport, 2, target_size)); break; case offsetofend(struct bpf_sk_lookup, remote_port): *target_size = 2; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); break; case offsetof(struct bpf_sk_lookup, local_port): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, dport, 2, target_size)); break; case offsetof(struct bpf_sk_lookup, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct bpf_sk_lookup_kern, ingress_ifindex, 4, target_size)); break; } return insn - insn_buf; } const struct bpf_prog_ops sk_lookup_prog_ops = { .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { .get_func_proto = sk_lookup_func_proto, .is_valid_access = sk_lookup_is_valid_access, .convert_ctx_access = sk_lookup_convert_ctx_access, }; #endif /* CONFIG_INET */ DEFINE_BPF_DISPATCHER(xdp) void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) { bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { /* tcp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { .func = bpf_skc_to_tcp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], }; BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { .func = bpf_skc_to_tcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], }; BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) { /* BTF types for tcp_timewait_sock and inet_timewait_sock are not * generated if CONFIG_INET=n. Trigger an explicit generation here. 
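 * BTF_TYPE_EMIT(t) is a compile-time-only reference, ((void)(t *)0):
 * just enough of a use for the type to be emitted into DWARF and from
 * there into vmlinux BTF.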
*/ BTF_TYPE_EMIT(struct inet_timewait_sock); BTF_TYPE_EMIT(struct tcp_timewait_sock); #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { .func = bpf_skc_to_tcp_timewait_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], }; BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) { #ifdef CONFIG_INET if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif #if IS_BUILTIN(CONFIG_IPV6) if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) return (unsigned long)sk; #endif return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { .func = bpf_skc_to_tcp_request_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], }; BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) { /* udp6_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct udp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .func = bpf_skc_to_udp6_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) { /* unix_sock type is not generated in dwarf and hence btf, * trigger an explicit type generation here. 
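 * Illustrative BPF-side use (a sketch; where the trusted "sk" pointer
 * comes from depends on the program type and is assumed here):
 *
 *   struct unix_sock *un = bpf_skc_to_unix_sock(sk);
 *
 *   if (un && un->addr)
 *           bpf_printk("name len %d", un->addr->len);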
*/ BTF_TYPE_EMIT(struct unix_sock); if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) return (unsigned long)sk; return (unsigned long)NULL; } const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { .func = bpf_skc_to_unix_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], }; BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) { BTF_TYPE_EMIT(struct mptcp_sock); return (unsigned long)bpf_mptcp_sock_from_subflow(sk); } const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { .func = bpf_skc_to_mptcp_sock, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .arg1_type = ARG_PTR_TO_SOCK_COMMON, .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], }; BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); } BTF_ID_LIST(bpf_sock_from_file_btf_ids) BTF_ID(struct, socket) BTF_ID(struct, file) const struct bpf_func_proto bpf_sock_from_file_proto = { .func = bpf_sock_from_file, .gpl_only = false, .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, .ret_btf_id = &bpf_sock_from_file_btf_ids[0], .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], }; static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; switch (func_id) { case BPF_FUNC_skc_to_tcp6_sock: func = &bpf_skc_to_tcp6_sock_proto; break; case BPF_FUNC_skc_to_tcp_sock: func = &bpf_skc_to_tcp_sock_proto; break; case BPF_FUNC_skc_to_tcp_timewait_sock: func = &bpf_skc_to_tcp_timewait_sock_proto; break; case BPF_FUNC_skc_to_tcp_request_sock: func = &bpf_skc_to_tcp_request_sock_proto; break; case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; case BPF_FUNC_skc_to_mptcp_sock: func = &bpf_skc_to_mptcp_sock_proto; break; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id, prog); } if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct sk_buff *skb = (struct sk_buff *)s; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); return 0; } __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; struct xdp_buff *xdp = (struct xdp_buff *)x; if (flags) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); return 0; } __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, const u8 *sun_path, u32 sun_path__sz) { struct sockaddr_un *un; if (sa_kern->sk->sk_family != AF_UNIX) return -EINVAL; /* We do not allow changing the address to unnamed or larger than the * maximum allowed address size for a unix sockaddr. 
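 * Note that only the length is validated below: a sun_path whose first
 * byte is NUL (an abstract address) is still accepted as long as
 * 0 < sun_path__sz <= UNIX_PATH_MAX, and the resulting uaddrlen is
 * offsetof(struct sockaddr_un, sun_path) + sun_path__sz.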
*/ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) return -EINVAL; un = (struct sockaddr_un *)sa_kern->uaddr; memcpy(un->sun_path, sun_path, sun_path__sz); sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; return 0; } __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, struct bpf_tcp_req_attrs *attrs, int attrs__sz) { #if IS_ENABLED(CONFIG_SYN_COOKIES) struct sk_buff *skb = (struct sk_buff *)s; const struct request_sock_ops *ops; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct request_sock *req; struct net *net; __u16 min_mss; u32 tsoff = 0; if (attrs__sz != sizeof(*attrs) || attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) return -EINVAL; if (!skb_at_tc_ingress(skb)) return -EINVAL; net = dev_net(skb->dev); if (net != sock_net(sk)) return -ENETUNREACH; switch (skb->protocol) { case htons(ETH_P_IP): ops = &tcp_request_sock_ops; min_mss = 536; break; #if IS_BUILTIN(CONFIG_IPV6) case htons(ETH_P_IPV6): ops = &tcp6_request_sock_ops; min_mss = IPV6_MIN_MTU - 60; break; #endif default: return -EINVAL; } if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || sk_is_mptcp(sk)) return -EINVAL; if (attrs->mss < min_mss) return -EINVAL; if (attrs->wscale_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) return -EINVAL; if (attrs->snd_wscale > TCP_MAX_WSCALE || attrs->rcv_wscale > TCP_MAX_WSCALE) return -EINVAL; } if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) return -EINVAL; if (attrs->tstamp_ok) { if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) return -EINVAL; tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); } req = inet_reqsk_alloc(ops, sk, false); if (!req) return -ENOMEM; ireq = inet_rsk(req); treq = tcp_rsk(req); req->rsk_listener = sk; req->syncookie = 1; req->mss = attrs->mss; req->ts_recent = attrs->rcv_tsval; ireq->snd_wscale = attrs->snd_wscale; ireq->rcv_wscale = attrs->rcv_wscale; ireq->tstamp_ok = !!attrs->tstamp_ok; ireq->sack_ok = !!attrs->sack_ok; ireq->wscale_ok = !!attrs->wscale_ok; ireq->ecn_ok = !!attrs->ecn_ok; treq->req_usec_ts = !!attrs->usec_ts_ok; treq->ts_off = tsoff; skb_orphan(skb); skb->sk = req_to_sk(req); skb->destructor = sock_pfree; return 0; #else return -EOPNOTSUPP; #endif } __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, u64 flags) { struct sk_buff *skb; if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) return -EOPNOTSUPP; if (flags) return -EINVAL; skb = skops->skb; skb_shinfo(skb)->tx_flags |= SKBTX_BPF; TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; return 0; } __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, struct bpf_dynptr *ptr__uninit) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; int err; err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); if (err) return err; bpf_dynptr_set_rdonly(ptr); return 0; } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) 
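/*
 * Illustrative BPF-side caller for bpf_sock_ops_enable_tx_tstamp() above.
 * This is a sketch only: the __ksym declaration and the
 * bpf_cast_to_kern_ctx() step follow the usual pattern for kfuncs taking
 * kernel ctx structs and are assumptions here, not a definitive ABI.
 *
 *   extern int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
 *                                            __u64 flags) __ksym;
 *
 *   SEC("sockops")
 *   int mark_tx(struct bpf_sock_ops *skops)
 *   {
 *           struct bpf_sock_ops_kern *kctx = bpf_cast_to_kern_ctx(skops);
 *
 *           if (skops->op == BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
 *                   bpf_sock_ops_enable_tx_tstamp(kctx, 0);
 *           return 1;
 *   }
 */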
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, }; static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_addr, }; static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_tcp_reqsk, }; static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_sock_ops, }; static int __init bpf_kfunc_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); __bpf_kfunc_start_defs(); /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. * * The function expects a non-NULL pointer to a socket, and invokes the * protocol specific socket destroy handlers. * * The helper can only be called from BPF contexts that have acquired the socket * locks. * * Parameters: * @sock: Pointer to socket to be destroyed * * Return: * On error, may return EPROTONOSUPPORT, EINVAL. * EPROTONOSUPPORT if protocol specific destroy handler is not supported. * 0 otherwise */ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) { struct sock *sk = (struct sock *)sock; /* The locking semantics that allow for synchronous execution of the * destroy handlers are only supported for TCP and UDP. * Supporting protocols will need to acquire sock lock in the BPF context * prior to invoking this kfunc. 
*/ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)) return -EOPNOTSUPP; return sk->sk_prot->diag_destroy(sk, ECONNABORTED); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && prog->expected_attach_type != BPF_TRACE_ITER) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_sk_iter_kfunc_ids, .filter = tracing_iter_filter, }; static int init_subsystem(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); } late_initcall(init_subsystem);
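/*
 * Editor's note -- illustrative sketch, not part of the kernel source above.
 * tracing_iter_filter() above restricts bpf_sock_destroy() to tracing
 * programs attached as BPF_TRACE_ITER, i.e. socket iterators, which run
 * with the socket lock held as the kfunc's doc comment requires. A
 * hypothetical "iter/tcp" program calling the kfunc might look like this
 * (the port number is made up for illustration).
 */
#if 0 /* compiled separately with clang -target bpf, not in this file */
extern int bpf_sock_destroy(struct sock_common *sk) __ksym;

SEC("iter/tcp")
int destroy_filtered_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;

	if (!sk_common)
		return 0;

	/* abort TCP sockets bound to the (hypothetical) local port 4040 */
	if (sk_common->skc_num == 4040)
		bpf_sock_destroy(sk_common);

	return 0;
}
#endif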
// SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/mm/fault.c * * Copyright (C) 1995 Linus Torvalds * Copyright (C) 1995-2004 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/acpi.h> #include <linux/bitfield.h> #include <linux/extable.h> #include <linux/kfence.h> #include <linux/signal.h> #include <linux/mm.h> #include <linux/hardirq.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/page-flags.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/highmem.h> #include <linux/perf_event.h> #include <linux/pkeys.h> #include <linux/preempt.h> #include <linux/hugetlb.h> #include <asm/acpi.h> #include <asm/bug.h> #include <asm/cmpxchg.h> #include <asm/cpufeature.h> #include <asm/efi.h> #include <asm/exception.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/esr.h> #include <asm/kprobes.h> #include <asm/mte.h> #include <asm/processor.h> #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/tlbflush.h> #include <asm/traps.h> struct fault_info { int (*fn)(unsigned long far, unsigned long esr, struct pt_regs *regs); int sig; int code; const char *name; }; static const struct fault_info fault_info[]; static struct fault_info debug_fault_info[]; static inline const struct fault_info *esr_to_fault_info(unsigned long esr) { return fault_info + (esr & ESR_ELx_FSC); } static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr) { return debug_fault_info + DBG_ESR_EVT(esr); } static void data_abort_decode(unsigned long esr) { unsigned long iss2 = ESR_ELx_ISS2(esr); pr_alert("Data abort info:\n"); if (esr & ESR_ELx_ISV) { pr_alert(" Access size = %u byte(s)\n", 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT)); pr_alert(" SSE = %lu, SRT = %lu\n", (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT, (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT); pr_alert(" SF = %lu, AR = %lu\n", (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT, (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT); } else { pr_alert(" ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n", esr & ESR_ELx_ISS_MASK, iss2); } pr_alert(" CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n", (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT, (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT, (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT, (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT); pr_alert(" GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n", (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT, (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT, (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT, (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT); } static void mem_abort_decode(unsigned long esr) { pr_alert("Mem abort info:\n"); pr_alert(" ESR = 0x%016lx\n", esr); pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n", ESR_ELx_EC(esr), esr_get_class_string(esr), (esr & ESR_ELx_IL) ?
32 : 16); pr_alert(" SET = %lu, FnV = %lu\n", (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT, (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT); pr_alert(" EA = %lu, S1PTW = %lu\n", (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT, (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT); pr_alert(" FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC), esr_to_fault_info(esr)->name); if (esr_is_data_abort(esr)) data_abort_decode(esr); } static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm) { /* Either init_pg_dir or swapper_pg_dir */ if (mm == &init_mm) return __pa_symbol(mm->pgd); return (unsigned long)virt_to_phys(mm->pgd); } /* * Dump out the page tables associated with 'addr' in the currently active mm. */ static void show_pte(unsigned long addr) { struct mm_struct *mm; pgd_t *pgdp; pgd_t pgd; if (is_ttbr0_addr(addr)) { /* TTBR0 */ mm = current->active_mm; if (mm == &init_mm) { pr_alert("[%016lx] user address but active_mm is swapper\n", addr); return; } } else if (is_ttbr1_addr(addr)) { /* TTBR1 */ mm = &init_mm; } else { pr_alert("[%016lx] address between user and kernel address ranges\n", addr); return; } pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n", mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K, vabits_actual, mm_to_pgd_phys(mm)); pgdp = pgd_offset(mm, addr); pgd = READ_ONCE(*pgdp); pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd)); do { p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; if (pgd_none(pgd) || pgd_bad(pgd)) break; p4dp = p4d_offset(pgdp, addr); p4d = READ_ONCE(*p4dp); pr_cont(", p4d=%016llx", p4d_val(p4d)); if (p4d_none(p4d) || p4d_bad(p4d)) break; pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); pr_cont(", pud=%016llx", pud_val(pud)); if (pud_none(pud) || pud_bad(pud)) break; pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); pr_cont(", pmd=%016llx", pmd_val(pmd)); if (pmd_none(pmd) || pmd_bad(pmd)) break; ptep = pte_offset_map(pmdp, addr); if (!ptep) break; pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); pr_cont("\n"); } /* * This function sets the access flags (dirty, accessed), as well as write * permission, and only to a more permissive setting. * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { pteval_t old_pteval, pteval; pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; /* only preserve the access flags and write permission */ pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; /* * Setting the flags must be done atomically to avoid racing with the * hardware update of the access/dirty state. The PTE_RDONLY bit must * be set to the most permissive (lowest value) of *ptep and entry * (calculated as: a & b == ~(~a | ~b)). 
*/ pte_val(entry) ^= PTE_RDONLY; pteval = pte_val(pte); do { old_pteval = pteval; pteval ^= PTE_RDONLY; pteval |= pte_val(entry); pteval ^= PTE_RDONLY; pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); } while (pteval != old_pteval); /* Invalidate a stale read-only entry */ if (dirty) flush_tlb_page(vma, address); return 1; } static bool is_el1_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; } static bool is_el1_data_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR; } static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr)) return false; if (esr_fsc_is_permission_fault(esr)) return true; if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan()) return esr_fsc_is_translation_fault(esr) && (regs->pstate & PSR_PAN_BIT); return false; } static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { unsigned long flags; u64 par, dfsc; if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr)) return false; local_irq_save(flags); asm volatile("at s1e1r, %0" :: "r" (addr)); isb(); par = read_sysreg_par(); local_irq_restore(flags); /* * If we now have a valid translation, treat the translation fault as * spurious. */ if (!(par & SYS_PAR_EL1_F)) return true; /* * If we got a different type of fault from the AT instruction, * treat the translation fault as spurious. */ dfsc = FIELD_GET(SYS_PAR_EL1_FST, par); return !esr_fsc_is_translation_fault(dfsc); } static void die_kernel_fault(const char *msg, unsigned long addr, unsigned long esr, struct pt_regs *regs) { bust_spinlocks(1); pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg, addr); kasan_non_canonical_hook(addr); mem_abort_decode(esr); show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); make_task_dead(SIGKILL); } #ifdef CONFIG_KASAN_HW_TAGS static void report_tag_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { /* * SAS bits aren't set for all faults reported in EL1, so we can't * find out access size. */ bool is_write = !!(esr & ESR_ELx_WNR); kasan_report((void *)addr, 0, is_write, regs->pc); } #else /* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */ static inline void report_tag_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { } #endif static void do_tag_recovery(unsigned long addr, unsigned long esr, struct pt_regs *regs) { report_tag_fault(addr, esr, regs); /* * Disable MTE Tag Checking on the local CPU for the current EL. * It will be done lazily on the other CPUs when they will hit a * tag fault. */ sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK, SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE)); isb(); } static bool is_el1_mte_sync_tag_check_fault(unsigned long esr) { unsigned long fsc = esr & ESR_ELx_FSC; if (!is_el1_data_abort(esr)) return false; if (fsc == ESR_ELx_FSC_MTE) return true; return false; } static void __do_kernel_fault(unsigned long addr, unsigned long esr, struct pt_regs *regs) { const char *msg; /* * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. 
*/ if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr)) return; if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs), "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr)) return; if (is_el1_mte_sync_tag_check_fault(esr)) { do_tag_recovery(addr, esr, regs); return; } if (is_el1_permission_fault(addr, esr, regs)) { if (esr & ESR_ELx_WNR) msg = "write to read-only memory"; else if (is_el1_instruction_abort(esr)) msg = "execute from non-executable memory"; else msg = "read from unreadable memory"; } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { if (esr_fsc_is_translation_fault(esr) && kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return; msg = "paging request"; } if (efi_runtime_fixup_exception(regs, msg)) return; die_kernel_fault(msg, addr, esr, regs); } static void set_thread_esr(unsigned long address, unsigned long esr) { current->thread.fault_address = address; /* * If the faulting address is in the kernel, we must sanitize the ESR. * From userspace's point of view, kernel-only mappings don't exist * at all, so we report them as level 0 translation faults. * (This is not quite the way that "no mapping there at all" behaves: * an alignment fault not caused by the memory type would take * precedence over translation fault for a real access to empty * space. Unfortunately we can't easily distinguish "alignment fault * not caused by memory type" from "alignment fault caused by memory * type", so we ignore this wrinkle and just return the translation * fault.) */ if (!is_ttbr0_addr(current->thread.fault_address)) { switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_DABT_LOW: /* * These bits provide only information about the * faulting instruction, which userspace knows already. * We explicitly clear bits which are architecturally * RES0 in case they are given meanings in future. * We always report the ESR as if the fault was taken * to EL1 and so ISV and the bits in ISS[23:14] are * clear. (In fact it always will be a fault to EL1.) */ esr &= ESR_ELx_EC_MASK | ESR_ELx_IL | ESR_ELx_CM | ESR_ELx_WNR; esr |= ESR_ELx_FSC_FAULT; break; case ESR_ELx_EC_IABT_LOW: /* * Claim a level 0 translation fault. * All other bits are architecturally RES0 for faults * reported with that DFSC value, so we clear them. */ esr &= ESR_ELx_EC_MASK | ESR_ELx_IL; esr |= ESR_ELx_FSC_FAULT; break; default: /* * This should never happen (entry.S only brings us * into this code for insn and data aborts from a lower * exception level). Fail safe by not providing an ESR * context record at all. */ WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr); esr = 0; break; } } current->thread.fault_code = esr; } static void do_bad_area(unsigned long far, unsigned long esr, struct pt_regs *regs) { unsigned long addr = untagged_addr(far); /* * If we are in kernel mode at this point, we have no context to * handle this fault with. 
*/ if (user_mode(regs)) { const struct fault_info *inf = esr_to_fault_info(esr); set_thread_esr(addr, esr); arm64_force_sig_fault(inf->sig, inf->code, far, inf->name); } else { __do_kernel_fault(addr, esr, regs); } } static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, unsigned int mm_flags) { unsigned long iss2 = ESR_ELx_ISS2(esr); if (!system_supports_poe()) return false; if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay)) return true; return !arch_vma_access_permitted(vma, mm_flags & FAULT_FLAG_WRITE, mm_flags & FAULT_FLAG_INSTRUCTION, false); } static bool is_gcs_fault(unsigned long esr) { if (!esr_is_data_abort(esr)) return false; return ESR_ELx_ISS2(esr) & ESR_ELx_GCS; } static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; } /* * Note: not valid for EL1 DC IVAC, but we never use that such that it * should fault. EL0 cannot issue DC IVAC (undef). */ static bool is_write_abort(unsigned long esr) { return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr) { if (!system_supports_gcs()) return false; if (unlikely(is_gcs_fault(esr))) { /* GCS accesses must be performed on a GCS page */ if (!(vma->vm_flags & VM_SHADOW_STACK)) return true; } else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) { /* Only GCS operations can write to a GCS page */ return esr_is_data_abort(esr) && is_write_abort(esr); } return false; } static int __kprobes do_page_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf; struct mm_struct *mm = current->mm; vm_fault_t fault; unsigned long vm_flags; unsigned int mm_flags = FAULT_FLAG_DEFAULT; unsigned long addr = untagged_addr(far); struct vm_area_struct *vma; int si_code; int pkey = -1; if (kprobe_page_fault(regs, esr)) return 0; /* * If we're in an interrupt or have no user context, we must not take * the fault. */ if (faulthandler_disabled() || !mm) goto no_context; if (user_mode(regs)) mm_flags |= FAULT_FLAG_USER; /* * vm_flags tells us what bits we must have in vma->vm_flags * for the fault to be benign, __do_page_fault() would check * vma->vm_flags & vm_flags and returns an error if the * intersection is empty */ if (is_el0_instruction_abort(esr)) { /* It was exec fault */ vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; } else if (is_gcs_fault(esr)) { /* * The GCS permission on a page implies both read and * write so always handle any GCS fault as a write fault, * we need to trigger CoW even for GCS reads. 
*/ vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } else if (is_write_abort(esr)) { /* It was write fault */ vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } else { /* It was read fault */ vm_flags = VM_READ; /* Write implies read */ vm_flags |= VM_WRITE; /* If EPAN is absent then exec implies read */ if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN)) vm_flags |= VM_EXEC; } if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) { if (is_el1_instruction_abort(esr)) die_kernel_fault("execution of user memory", addr, esr, regs); if (!insn_may_access_user(regs->pc, esr)) die_kernel_fault("access to user memory outside uaccess routines", addr, esr, regs); } perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, addr); if (!vma) goto lock_mmap; if (is_invalid_gcs_access(vma, esr)) { vma_end_read(vma); fault = 0; si_code = SEGV_ACCERR; goto bad_area; } if (!(vma->vm_flags & vm_flags)) { vma_end_read(vma); fault = 0; si_code = SEGV_ACCERR; count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto bad_area; } if (fault_from_pkey(esr, vma, mm_flags)) { pkey = vma_pkey(vma); vma_end_read(vma); fault = 0; si_code = SEGV_PKUERR; count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto bad_area; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) mm_flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; return 0; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, addr, regs); if (unlikely(!vma)) { fault = 0; si_code = SEGV_MAPERR; goto bad_area; } if (!(vma->vm_flags & vm_flags)) { mmap_read_unlock(mm); fault = 0; si_code = SEGV_ACCERR; goto bad_area; } if (fault_from_pkey(esr, vma, mm_flags)) { pkey = vma_pkey(vma); mmap_read_unlock(mm); fault = 0; si_code = SEGV_PKUERR; goto bad_area; } fault = handle_mm_fault(vma, addr, mm_flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) goto no_context; return 0; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return 0; if (fault & VM_FAULT_RETRY) { mm_flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: /* Handle the "normal" (no error) case first. */ if (likely(!(fault & VM_FAULT_ERROR))) return 0; si_code = SEGV_MAPERR; bad_area: /* * If we are in kernel mode at this point, we have no context to * handle this fault with. */ if (!user_mode(regs)) goto no_context; if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return to * userspace (which will retry the fault, or kill us if we got * oom-killed). */ pagefault_out_of_memory(); return 0; } inf = esr_to_fault_info(esr); set_thread_esr(addr, esr); if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to successfully fix up * this page fault. 
*/ arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { unsigned int lsb; lsb = PAGE_SHIFT; if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { /* * The pkey value that we return to userspace can be different * from the pkey that caused the fault. * * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); * 2. T1 : set POR_EL0 to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2 : mprotect_key(foo, PAGE_SIZE, pkey=5); * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ /* Something tried to access memory that is outside the memory map */ if (si_code == SEGV_PKUERR) arm64_force_sig_fault_pkey(far, inf->name, pkey); else arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name); } return 0; no_context: __do_kernel_fault(addr, esr, regs); return 0; } static int __kprobes do_translation_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { unsigned long addr = untagged_addr(far); if (is_ttbr0_addr(addr)) return do_page_fault(far, esr, regs); do_bad_area(far, esr, regs); return 0; } static int do_alignment_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) && compat_user_mode(regs)) return do_compat_alignment_fixup(far, regs); do_bad_area(far, esr, regs); return 0; } static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs) { return 1; /* "fault" */ } static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf; unsigned long siaddr; inf = esr_to_fault_info(esr); if (user_mode(regs) && apei_claim_sea(regs) == 0) { /* * APEI claimed this as a firmware-first notification. * Some processing deferred to task_work before ret_to_user(). */ return 0; } if (esr & ESR_ELx_FnV) { siaddr = 0; } else { /* * The architecture specifies that the tag bits of FAR_EL1 are * UNKNOWN for synchronous external aborts. Mask them out now * so that userspace doesn't see them. */ siaddr = untagged_addr(far); } arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr); return 0; } static int do_tag_check_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { /* * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN * for tag check faults. Set them to corresponding bits in the untagged * address.
*/ far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK); do_bad_area(far, esr, regs); return 0; } static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, { do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" }, { do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented { do_bad, SIGKILL, SI_KERNEL, "unknown 25" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 26" }, { do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented { do_bad, SIGKILL, SI_KERNEL, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 34" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 35" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 36" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 37" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 38" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 39" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 40" }, { do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 42" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level -1 translation fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 44" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 45" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 46" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 47" }, { do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" }, { do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware 
update fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 50" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 51" }, { do_bad, SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" }, { do_bad, SIGBUS, BUS_OBJERR, "implementation fault (unsupported exclusive)" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 54" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 55" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 56" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 57" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 58" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 59" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 60" }, { do_bad, SIGKILL, SI_KERNEL, "section domain fault" }, { do_bad, SIGKILL, SI_KERNEL, "page domain fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, }; void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_fault_info(esr); unsigned long addr = untagged_addr(far); if (!inf->fn(far, esr, regs)) return; if (!user_mode(regs)) die_kernel_fault(inf->name, addr, esr, regs); /* * At this point we have an unrecognized fault type whose tag bits may * have been defined as UNKNOWN. Therefore we only expose the untagged * address to the signal handler. */ arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr); } NOKPROBE_SYMBOL(do_mem_abort); void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs) { arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, addr, esr); } NOKPROBE_SYMBOL(do_sp_pc_abort); /* * __refdata because early_brk64 is __init, but the reference to it is * clobbered at arch_initcall time. * See traps.c and debug-monitors.c:debug_traps_init(). */ static struct fault_info __refdata debug_fault_info[] = { { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" }, { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" }, { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 3" }, { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" }, { do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" }, { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 7" }, }; void __init hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned long, struct pt_regs *), int sig, int code, const char *name) { BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info)); debug_fault_info[nr].fn = fn; debug_fault_info[nr].sig = sig; debug_fault_info[nr].code = code; debug_fault_info[nr].name = name; } /* * In debug exception context, we explicitly disable preemption despite * having interrupts disabled. * This serves two purposes: it makes it much less likely that we would * accidentally schedule in exception context and it will force a warning * if we somehow manage to schedule by accident. */ static void debug_exception_enter(struct pt_regs *regs) { preempt_disable(); /* This code is a bit fragile. Test it. 
*/ RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work"); } NOKPROBE_SYMBOL(debug_exception_enter); static void debug_exception_exit(struct pt_regs *regs) { preempt_enable_no_resched(); } NOKPROBE_SYMBOL(debug_exception_exit); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_debug_fault_info(esr); unsigned long pc = instruction_pointer(regs); debug_exception_enter(regs); if (user_mode(regs) && !is_ttbr0_addr(pc)) arm64_apply_bp_hardening(); if (inf->fn(addr_if_watchpoint, esr, regs)) { arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr); } debug_exception_exit(regs); } NOKPROBE_SYMBOL(do_debug_exception); /* * Used during anonymous page fault handling. */ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO; /* * If the page is mapped with PROT_MTE, initialise the tags at the * point of allocation and page zeroing as this is usually faster than * separate DC ZVA and STGM. */ if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; return vma_alloc_folio(flags, 0, vma, vaddr); } void tag_clear_highpage(struct page *page) { /* Newly allocated page, shouldn't have been tagged yet */ WARN_ON_ONCE(!try_page_mte_tagging(page)); mte_zero_clear_page_tags(page_address(page)); set_page_mte_tagged(page); }
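/*
 * Editor's note -- illustrative sketch, not part of the kernel source above.
 * A worked example of the PTE_RDONLY trick in __ptep_set_access_flags()
 * earlier in this file: since a *set* PTE_RDONLY bit is the restrictive
 * state, the most permissive merge of two PTEs is the AND of their
 * PTE_RDONLY bits, computed as a & b == ~(~a | ~b) by XOR-ing PTE_RDONLY
 * around an OR. Standalone userspace demonstration (the bit position is a
 * placeholder for illustration):
 */
#if 0
#include <assert.h>
#include <stdint.h>

#define PTE_RDONLY (1u << 7) /* illustrative bit position */

static uint32_t merge_rdonly(uint32_t pteval, uint32_t entry)
{
	entry ^= PTE_RDONLY;   /* done once before the cmpxchg loop */
	pteval ^= PTE_RDONLY;  /* ~a for the RDONLY bit */
	pteval |= entry;       /* ~a | ~b; other flag bits OR in directly */
	pteval ^= PTE_RDONLY;  /* ~(~a | ~b) == a & b */
	return pteval;
}

int main(void)
{
	/* writable + read-only merges to writable (more permissive) */
	assert((merge_rdonly(0, PTE_RDONLY) & PTE_RDONLY) == 0);
	assert((merge_rdonly(PTE_RDONLY, 0) & PTE_RDONLY) == 0);
	/* read-only + read-only stays read-only */
	assert((merge_rdonly(PTE_RDONLY, PTE_RDONLY) & PTE_RDONLY) == PTE_RDONLY);
	return 0;
}
#endif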
5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static void ext4_journalled_zero_new_buffers(handle_t *handle,
					     struct inode *inode,
					     struct folio *folio,
					     unsigned from, unsigned to);
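/*
 * Overview sketch (an illustrative summary, not part of the original file):
 * the checksum helpers below hash the raw inode with the checksum fields
 * themselves replaced by zeroes, then store the 32-bit result split across
 * the two 16-bit on-disk fields, roughly:
 *
 *	csum = ext4_inode_csum(inode, raw, ei);
 *	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);  // low 16 bits
 *	raw->i_checksum_hi = cpu_to_le16(csum >> 16);     // high 16 bits
 *
 * The high half is stored only when the inode is large enough to carry
 * i_checksum_hi (see EXT4_FITS_IN_INODE() below).
 */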
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			     struct ext4_inode_info *ei)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(sbi, csum, (__u8 *)raw +
				   EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
		}
		csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
				   EXT4_INODE_SIZE(inode->i_sb) - offset);
	}

	return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
				  struct ext4_inode_info *ei)
{
	__u32 provided, calculated;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_feature_metadata_csum(inode->i_sb))
		return 1;

	provided = le16_to_cpu(raw->i_checksum_lo);
	calculated = ext4_inode_csum(inode, raw, ei);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
	else
		calculated &= 0xFFFF;

	return provided == calculated;
}

void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
			 struct ext4_inode_info *ei)
{
	__u32 csum;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_feature_metadata_csum(inode->i_sb))
		return;

	csum = ext4_inode_csum(inode, raw, ei);
	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}

static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	trace_ext4_begin_ordered_truncate(inode, new_size);
	/*
	 * If jinode is zero, then we never opened the file for
	 * writing, so there's no need to call
	 * jbd2_journal_begin_ordered_truncate() since there are no
	 * outstanding writes we need to flush.
	 */
	if (!EXT4_I(inode)->jinode)
		return 0;
	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
						   EXT4_I(inode)->jinode,
						   new_size);
}

static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
				  int pextents);

/*
 * Test whether an inode is a fast symlink.
 * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
		int ea_blocks = EXT4_I(inode)->i_file_acl ?
				EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;

		if (ext4_has_inline_data(inode))
			return 0;

		return (S_ISLNK(inode->i_mode) &&
			inode->i_blocks - ea_blocks == 0);
	}
	return S_ISLNK(inode->i_mode) && inode->i_size &&
	       (inode->i_size < EXT4_N_BLOCKS * 4);
}

/*
 * Called at the last iput() if i_nlink is zero.
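 * We truncate any remaining data, drop extended attribute references,
 * and finally free the on-disk inode itself.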
 */
void ext4_evict_inode(struct inode *inode)
{
	handle_t *handle;
	int err;
	/*
	 * Credits for final inode cleanup and freeing:
	 * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
	 */
	int extra_credits = 6;
	struct ext4_xattr_inode_array *ea_inode_array = NULL;
	bool freeze_protected = false;

	trace_ext4_evict_inode(inode);

	dax_break_layout_final(inode);

	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
		ext4_evict_ea_inode(inode);
	if (inode->i_nlink) {
		truncate_inode_pages_final(&inode->i_data);
		goto no_delete;
	}

	if (is_bad_inode(inode))
		goto no_delete;
	dquot_initialize(inode);

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages_final(&inode->i_data);

	/*
	 * For inodes with journalled data, a transaction commit could have
	 * dirtied the inode. And for inodes with dioread_nolock, the worker
	 * converting unwritten extents could have merged extents and dirtied
	 * the inode as well. The flush worker ignores it because of the
	 * I_FREEING flag, but we still need to remove the inode from the
	 * writeback lists.
	 */
	if (!list_empty_careful(&inode->i_io_list))
		inode_io_list_del(inode);

	/*
	 * Protect us against freezing - iput() caller didn't have to have any
	 * protection against it. When we are in a running transaction though,
	 * we are already protected against freezing and we cannot grab further
	 * protection due to lock ordering constraints.
	 */
	if (!ext4_journal_current_handle()) {
		sb_start_intwrite(inode->i_sb);
		freeze_protected = true;
	}

	if (!IS_NOQUOTA(inode))
		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

	/*
	 * Block bitmap, group descriptor, and inode are accounted in both
	 * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
				    ext4_blocks_for_truncate(inode) +
				    extra_credits - 3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	/*
	 * Set inode->i_size to 0 before calling ext4_truncate(). We need
	 * special handling of symlinks here because i_size is used to
	 * determine whether ext4_inode_info->i_data contains symlink data or
	 * block mappings. Setting i_size to 0 will remove its fast symlink
	 * status. Erase i_data so that it becomes a valid empty block map.
	 */
	if (ext4_inode_is_fast_symlink(inode))
		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks) {
		err = ext4_truncate(inode);
		if (err) {
			ext4_error_err(inode->i_sb, -err,
				       "couldn't truncate inode %lu (err %d)",
				       inode->i_ino, err);
			goto stop_handle;
		}
	}

	/* Remove xattr references. */
	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
				      extra_credits);
	if (err) {
		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
stop_handle:
		ext4_journal_stop(handle);
		ext4_orphan_del(NULL, inode);
		if (freeze_protected)
			sb_end_intwrite(inode->i_sb);
		ext4_xattr_inode_array_free(ea_inode_array);
		goto no_delete;
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime	= (__u32)ktime_get_real_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	if (freeze_protected)
		sb_end_intwrite(inode->i_sb);
	ext4_xattr_inode_array_free(ea_inode_array);
	return;
no_delete:
	/*
	 * Catch cases where something else accidentally dirtied the evicting
	 * inode, which could cause inode use-after-free issues later.
	 */
	WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

	if (!list_empty(&EXT4_I(inode)->i_fc_list))
		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
				  int used, int quota_claim)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	spin_lock(&ei->i_block_reservation_lock);
	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
	if (unlikely(used > ei->i_reserved_data_blocks)) {
		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
			     "with only %d reserved data blocks",
			     __func__, inode->i_ino, used,
			     ei->i_reserved_data_blocks);
		WARN_ON(1);
		used = ei->i_reserved_data_blocks;
	}

	/* Update per-inode reservations */
	ei->i_reserved_data_blocks -= used;
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

	spin_unlock(&ei->i_block_reservation_lock);

	/* Update quota subsystem for data blocks */
	if (quota_claim)
		dquot_claim_block(inode, EXT4_C2B(sbi, used));
	else {
		/*
		 * We did fallocate with an offset that is already delayed
		 * allocated. So on delayed allocated writeback we should
		 * not re-claim the quota for fallocated blocks.
		 */
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
	}

	/*
	 * If we have done all the pending block allocations and if
	 * there aren't any writers on the inode, we can discard the
	 * inode's preallocations.
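	 * (Nothing can be using the reserved-but-unused blocks once there
	 * are no pending reservations and no open writers.)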
	 */
	if ((ei->i_reserved_data_blocks == 0) &&
	    !inode_is_open_for_write(inode))
		ext4_discard_preallocations(inode);
}

static int __check_block_validity(struct inode *inode, const char *func,
				  unsigned int line,
				  struct ext4_map_blocks *map)
{
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

	if (journal && inode == journal->j_inode)
		return 0;

	if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
		ext4_error_inode(inode, func, line, map->m_pblk,
				 "lblock %lu mapped to illegal pblock %llu "
				 "(length %d)", (unsigned long) map->m_lblk,
				 map->m_pblk, map->m_len);
		return -EFSCORRUPTED;
	}
	return 0;
}

int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
		       ext4_lblk_t len)
{
	int ret;

	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
		return fscrypt_zeroout_range(inode, lblk, pblk, len);

	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
	if (ret > 0)
		ret = 0;

	return ret;
}

#define check_block_validity(inode, map)	\
	__check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
				       struct inode *inode,
				       struct ext4_map_blocks *es_map,
				       struct ext4_map_blocks *map,
				       int flags)
{
	int retval;

	map->m_flags = 0;
	/*
	 * There is a race window in which the result is not the same.
	 * e.g. xfstests #223 when dioread_nolock is enabled. The reason
	 * is that we look up a block mapping in the extent status tree
	 * without taking i_data_sem. So at this point the unwritten extent
	 * could have been converted.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/*
	 * We don't check m_len because the extent will be collapsed in the
	 * status tree, so the lengths might not be equal.
	 */
	if (es_map->m_lblk != map->m_lblk ||
	    es_map->m_flags != map->m_flags ||
	    es_map->m_pblk != map->m_pblk) {
		printk("ES cache assertion failed for inode: %lu "
		       "es_cached ex [%d/%d/%llu/%x] != "
		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
		       inode->i_ino, es_map->m_lblk, es_map->m_len,
		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
		       map->m_len, map->m_pblk, map->m_flags,
		       retval, flags);
	}
}
#endif /* ES_AGGRESSIVE_TEST */

static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
				 struct ext4_map_blocks *map)
{
	unsigned int status;
	int retval;

	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	else
		retval = ext4_ind_map_blocks(handle, inode, map, 0);

	if (retval <= 0)
		return retval;

	if (unlikely(retval != map->m_len)) {
		ext4_warning(inode->i_sb,
			     "ES len assertion failed for inode "
			     "%lu: retval %d != map->m_len %d",
			     inode->i_ino, retval, map->m_len);
		WARN_ON(1);
	}

	status = map->m_flags & EXT4_MAP_UNWRITTEN ?
			EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
	ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
			      map->m_pblk, status, false);
	return retval;
}

static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
				  struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	unsigned int status;
	int err, retval = 0;

	/*
	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE, which
	 * indicates that the blocks and quotas have already been
	 * checked when the data was copied into the page cache.
	 */
	if (map->m_flags & EXT4_MAP_DELAYED)
		flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

	/*
	 * Here we clear m_flags because after allocating a new extent,
	 * it will be set again.
*/ map->m_flags &= ~EXT4_MAP_FLAGS; /* * We need to check for EXT4 here because migrate could have * changed the inode type in between. */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); /* * We allocated new blocks which will result in i_data's * format changing. Force the migrate to fail by clearing * migrate flags. */ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode %lu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (err) return err; } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) return retval; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); return retval; } /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are * pre-allocated and unwritten, the resulting @map is marked as unwritten. * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. 
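 *
 * Locking, as the function body below shows: a plain lookup takes
 * i_data_sem for reading, while block allocation and unwritten-extent
 * conversion retake it for writing.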
*/ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; map->m_flags |= ext4_es_is_delayed(&es) ? EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); retval = ext4_map_query_blocks(handle, inode, map); up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. 
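 * Ordered-mode note: putting such blocks on the transaction's ordered
 * data list makes jbd2 flush the data before the metadata referencing
 * it commits, so a crash cannot expose stale block contents.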
*/ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. 
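 * As a rough illustration (assuming the common, but not guaranteed,
 * 4KiB block size): 4096 blocks caps a single mapping at 16MiB.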
*/ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); /* * Since bh could introduce extra ref count such as referred by * journal_head etc. Try to avoid using __GFP_MOVABLE here * as it may fail the migration when journal_head remains. */ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, inode->i_sb->s_blocksize); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. 
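 * (A NULL bh comes from ext4_getblk() returning NULL rather than an
 * error when a block is unmapped and create was not requested.)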
*/ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { folio_mark_dirty(bh->b_folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; BUFFER_TRACE(bh, "get write access"); return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); } int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize = inode->i_sb->s_blocksize; unsigned bbits; struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { /* * We may be zeroing partial buffers or all new * buffers in case of failure. Prepare JBD2 for * that. */ if (should_journal_data) do_journal_get_write_access(handle, inode, bh); if (folio_test_uptodate(folio)) { /* * Unlike __block_write_begin() we leave * dirtying of new uptodate buffers to * ->write_end() time or * folio_zero_new_buffers(). 
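 * (Deferring the dirtying also avoids marking buffers dirty that the
 * pending copy-in is about to overwrite anyway.)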
*/ set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { if (should_journal_data) ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); else folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; from = pos & (PAGE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, foliop); if (ret < 0) return ret; if (ret == 1) return 0; } /* * __filemap_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. 
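 * Creating them before the handle is started also means the allocation
 * does not have to use GFP_NOFS, as with the folio above.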
*/ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; folio_put(folio); return ret; } *foliop = folio; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } /* * Don't mark the inode dirty under folio lock. 
First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); } clear_buffer_new(bh); write_end_fn(handle, inode, bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. 
So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for 'nr_resv' clusters */ static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } ei->i_reserved_data_blocks += nr_resv; trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ /* * Extent to map - this can be after first_page because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_page == 0. 
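 * (With next_page == 0 there is nothing to release, and the
 * first_page >= next_page check below returns immediately.)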
*/ if (mpd->first_page >= mpd->next_page) return; mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); while (index <= end) { nr = filemap_get_folios(mapping, &index, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio->index < mpd->first_page) continue; if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * Check whether the cluster containing lblk has been allocated or has * delalloc reservation. * * Returns 0 if the cluster doesn't have either, 1 if it has delalloc * reservation, 2 if it's already been allocated, negative error code on * failure. */ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; /* Has delalloc reservation? */ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) return 2; ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret > 0) return 2; return 0; } /* * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents * status tree, incrementing the reserved * cluster/block count or making pending * reservations where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool lclu_allocated = false; bool end_allocated = false; ext4_lblk_t resv_clu; ext4_lblk_t end = lblk + len - 1; /* * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. 
Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; if (ret > 0) { resv_clu--; lclu_allocated = (ret == 2); } if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { ret = ext4_clu_alloc_state(inode, end); if (ret < 0) return ret; if (ret > 0) { resv_clu--; end_allocated = (ret == 2); } } if (resv_clu) { ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated); return 0; } /* * Looks up the requested blocks and sets the delalloc extent map. * First try to look up for the extent entry that contains the requested * blocks in the extent status tree without i_data_sem, then try to look * up for the ondisk extent mapping with i_data_sem in read mode, * finally hold i_data_sem in write mode, looks up again and add a * delalloc extent entry if it still couldn't find any extent. Pass out * the mapped extent through @map and return 0 on success. */ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return 0; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * lookup the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? 
retval : 0; } } map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return a mapped block or * reserve space for a single block. * * For a delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly. * * For an unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = the physical block mapping the unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map.m_lblk = iblock; map.m_len = 1; /* * First, we need to know whether the block is already allocated; * preallocated blocks are unmapped but should be treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, &map); if (ret < 0) return ret; if (map.m_flags & EXT4_MAP_DELAYED) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to an unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->first_page += folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; BUG_ON(folio->index != mpd->first_page); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects the writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release the folio lock. So only after * folio_clear_dirty_for_io() are we safe to sample i_size for * ext4_bio_write_folio() to zero out the tail of the written page. We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size.
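* (Worked arithmetic, added for illustration: with a 4k block size, the 2048-block cap below corresponds to at most 8 MiB of file data mapped per round.)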
*/ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contiguous blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh up to @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return a value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit a fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping.
* @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping and return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_and_submit_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, it means the page may require further bh * mapping, or maybe the page was submitted for IO, * so we return to do further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run!
*/ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches the page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However, we must not fail because we're in * writeback and there is nothing we can do about it, so failing might result * in data loss. So use reserved blocks to allocate metadata if * possible. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from a larger unwritten * extent. Note that we need not map all the described range since allocation * can return fewer blocks or the range may be covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the upper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_clusters() * is non-zero, a commit should free up blocks.
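* (Illustrative note, not part of the original comment: when -ENOSPC is returned here while free clusters remain, ext4_do_writepages() forces a journal commit and retries, so only a genuine out-of-space condition becomes fatal.)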
*/ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { if (progress) goto update_disksize; return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write--; if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. 
The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; mpd->next_page = index; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->first_page = folio->index; mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. 
*/ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ ret = ext4_emergency_state(mapping->host->i_sb); if (unlikely(ret)) goto out_writepages; /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in * the page and we may dirty the inode. 
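* (Worked example, illustrative only: with 1k blocks on 4k pages, PAGE_SIZE >> i_blkbits is 4, so the reserve below is sized against up to four per-block extent conversions, plus one for the inode.)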
*/ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, PAGE_SIZE >> inode->i_blkbits); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->first_page = writeback_index; mpd->last_page = -1; } else { mpd->first_page = wbc->range_start >> PAGE_SHIFT; mpd->last_page = wbc->range_end >> PAGE_SHIFT; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->first_page, mpd->last_page); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_pages(inode, mpd->first_page, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. 
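* (Illustrative restatement: ext4_put_io_end_defer() hands the final io_end reference to a workqueue, so any unwritten extent conversion it triggers cannot deadlock against the handle we still hold.)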
*/ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->last_page = writeback_index - 1; mpd->first_page = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * Switch to non-delalloc mode if we are running low * on free blocks. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters, and * delalloc needs accurate free block accounting. So switch * to non-delalloc when we are near the error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty.
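* (Worked example, illustrative: with 1000 free and 600 dirty clusters, 1000 < 2 * 600 starts background writeback, but 2 * 1000 >= 3 * 600 keeps delalloc enabled; at 700 dirty clusters, 2 * 1000 < 3 * 700 forces the switch to nondelalloc.)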
*/ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks, * or free blocks are less than the watermark */ return 1; } return 0; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_generic_write_inline_data(mapping, inode, pos, len, foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. We don't need * i_size_read because we hold the inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *foliop = folio; return ret; } /* * Check if we should update i_disksize * when writing to the end of file without requiring block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size, zero_len = 0; handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); folio_put(folio); return -EIO; } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which is all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, folio, NULL); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually.
If the end of the current write is > i_size and * inside an allocated block, which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here, since certain * ext4_writepages() paths do not allocate blocks and so will not update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); zero_len = pos - old_size; } if (!disksize_changed && !zero_len) return copied; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (zero_len) ext4_zero_partial_blocks(handle, inode, old_size, zero_len); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return copied; } static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal.
If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zeros written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for the file and the data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write?
*/ if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * no other metadata changes are being made or pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; /* HW-offload atomics are always used */ if (flags & IOMAP_ATOMIC) iomap->flags |= IOMAP_F_ATOMIC_BIO; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence * we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_DELAYED) { iomap->type = IOMAP_DELALLOC; iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); retry: /* * Either we allocate blocks and then don't get an unwritten extent, in * which case we have reserved enough credits; or the blocks are * already allocated and unwritten, in which case the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads.
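* (Illustrative note: if this gated on i_disksize instead, a write below i_size could be allocated as a written extent while direct reads below i_size are still permitted, exposing blocks before their data has landed.)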
*/ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fall back to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) goto out; } ret = ext4_iomap_alloc(inode, &map, flags); } else { ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save the overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) { /* must be a directio to fall back to buffered */ if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != (IOMAP_WRITE | IOMAP_DIRECT)) return false; /* atomic writes are all-or-nothing */ if (flags & IOMAP_ATOMIC) return false; /* can only try again if we wrote nothing */ return written == 0; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code for * a non-atomic write so that we fall back to buffered I/O and attempt to * complete the remainder of the I/O. * For non-atomic writes, any blocks that may have been * allocated in preparation for the direct I/O will be reused during * buffered I/O. For atomic writes, we never fall back to buffered I/O.
*/ if (ext4_want_directio_fallback(flags, written)) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for an offset beyond s_bitmap_maxbytes, * so handle it here instead of querying ext4_map_blocks(), * since ext4_map_blocks() would warn about it and return an * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, a folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately.
*/ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. 
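* (Illustrative scenario: ext4_zero_range() over an unwritten extent whose pages are still dirty in pagecache; if this helper skipped the buffer instead of reading and zeroing it, a later writeback could push the stale cached data to disk.)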
*/ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained within one block. If the specified range exceeds * the end of the block it will be shortened to the end of the block * that corresponds to 'from'. */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * Correct the length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This is required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown.
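* (Worked example, illustrative: truncating to offset 1000 with a 4k block size zeroes length = 4096 - (1000 & 4095) = 3096 bytes, i.e. bytes 1000-4095 of the final block.)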
 */
static int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from)
{
	unsigned offset = from & (PAGE_SIZE-1);
	unsigned length;
	unsigned blocksize;
	struct inode *inode = mapping->host;

	/* If we are processing an encrypted inode during orphan list handling */
	if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
		return 0;

	blocksize = inode->i_sb->s_blocksize;
	length = blocksize - (offset & (blocksize - 1));

	return ext4_block_zero_page_range(handle, mapping, from, length);
}

int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
			     loff_t lstart, loff_t length)
{
	struct super_block *sb = inode->i_sb;
	struct address_space *mapping = inode->i_mapping;
	unsigned partial_start, partial_end;
	ext4_fsblk_t start, end;
	loff_t byte_end = (lstart + length - 1);
	int err = 0;

	partial_start = lstart & (sb->s_blocksize - 1);
	partial_end = byte_end & (sb->s_blocksize - 1);

	start = lstart >> sb->s_blocksize_bits;
	end = byte_end >> sb->s_blocksize_bits;

	/* Handle partial zero within the single block */
	if (start == end &&
	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
		err = ext4_block_zero_page_range(handle, mapping,
						 lstart, length);
		return err;
	}
	/* Handle partial zero out on the start of the range */
	if (partial_start) {
		err = ext4_block_zero_page_range(handle, mapping,
						 lstart, sb->s_blocksize);
		if (err)
			return err;
	}
	/* Handle partial zero out on the end of the range */
	if (partial_end != sb->s_blocksize - 1)
		err = ext4_block_zero_page_range(handle, mapping,
						 byte_end - partial_end,
						 partial_end + 1);
	return err;
}

int ext4_can_truncate(struct inode *inode)
{
	if (S_ISREG(inode->i_mode))
		return 1;
	if (S_ISDIR(inode->i_mode))
		return 1;
	if (S_ISLNK(inode->i_mode))
		return !ext4_inode_is_fast_symlink(inode);
	return 0;
}

/*
 * We have to make sure i_disksize gets properly updated before we truncate
 * page cache due to hole punching or zero range. Otherwise i_disksize update
 * can get lost as it may have been postponed to submission of writeback but
 * that will never happen after we truncate page cache.
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
				      loff_t len)
{
	handle_t *handle;
	int ret;

	loff_t size = i_size_read(inode);

	WARN_ON(!inode_is_locked(inode));
	if (offset > size || offset + len < size)
		return 0;

	if (EXT4_I(inode)->i_disksize >= size)
		return 0;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	ext4_update_i_disksize(inode, size);
	ret = ext4_mark_inode_dirty(handle, inode);
	ext4_journal_stop(handle);

	return ret;
}

static inline void ext4_truncate_folio(struct inode *inode,
				       loff_t start, loff_t end)
{
	unsigned long blocksize = i_blocksize(inode);
	struct folio *folio;

	/* Nothing to be done if no complete block needs to be truncated. */
	if (round_up(start, blocksize) >= round_down(end, blocksize))
		return;

	folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
	if (IS_ERR(folio))
		return;

	if (folio_mkclean(folio))
		folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);
}

int ext4_truncate_page_cache_block_range(struct inode *inode,
					 loff_t start, loff_t end)
{
	unsigned long blocksize = i_blocksize(inode);
	int ret;

	/*
	 * For journalled data we need to write (and checkpoint) pages
	 * before discarding page cache to avoid inconsistent data on disk
	 * in case of a crash before the transaction freeing blocks or
	 * converting unwritten extents is committed.
	 */
	if (ext4_should_journal_data(inode)) {
		ret = filemap_write_and_wait_range(inode->i_mapping, start,
						   end - 1);
		if (ret)
			return ret;
		goto truncate_pagecache;
	}

	/*
	 * If the block size is less than the page size, the file's mapped
	 * blocks within one page could be freed or converted to unwritten.
	 * So it's necessary to remove writable userspace mappings, and then
	 * ext4_page_mkwrite() can be called during subsequent write access
	 * to these partial folios.
	 */
	if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
	    blocksize < PAGE_SIZE && start < inode->i_size) {
		loff_t page_boundary = round_up(start, PAGE_SIZE);

		ext4_truncate_folio(inode, start, min(page_boundary, end));
		if (end > page_boundary)
			ext4_truncate_folio(inode,
					    round_down(end, PAGE_SIZE), end);
	}

truncate_pagecache:
	truncate_pagecache_range(inode, start, end - 1);
	return 0;
}

static void ext4_wait_dax_page(struct inode *inode)
{
	filemap_invalidate_unlock(inode->i_mapping);
	schedule();
	filemap_invalidate_lock(inode->i_mapping);
}

int ext4_break_layouts(struct inode *inode)
{
	if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
		return -EINVAL;

	return dax_break_layout_inode(inode, ext4_wait_dax_page);
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @file: File to punch the hole in
 * @offset: The offset where the hole will begin
 * @length: The length of the hole
 *
 * Returns: 0 on success or negative on failure
 */
int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
	struct inode *inode = file_inode(file);
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t start_lblk, end_lblk;
	loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
	loff_t end = offset + length;
	handle_t *handle;
	unsigned int credits;
	int ret;

	trace_ext4_punch_hole(inode, offset, length, 0);
	WARN_ON_ONCE(!inode_is_locked(inode));

	/* No need to punch hole beyond i_size */
	if (offset >= inode->i_size)
		return 0;

	/*
	 * If the hole extends beyond i_size, set the hole to end after
	 * the page that contains i_size, and also make sure that the hole
	 * ends no later than max_end, i.e. at least one block below
	 * s_bitmap_maxbytes.
	 */
	if (end > inode->i_size)
		end = round_up(inode->i_size, PAGE_SIZE);
	if (end > max_end)
		end = max_end;
	length = end - offset;

	/*
	 * Attach jinode to inode for jbd2 if we do any zeroing of partial
	 * block.
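	 *
	 * For instance (hypothetical numbers), punching offset = 1000,
	 * end = 9000 with a 4096-byte block size: (1000 | 9000) & 4095 is
	 * non-zero, so both ends of the hole touch partial blocks that
	 * must be zeroed, and ordered mode needs the jinode to track them.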
*/ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) return ret; /* Now release the pages and zero block aligned part of pages*/ ret = ext4_truncate_page_cache_block_range(inode, offset, end); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); return ret; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_handle; /* If there are blocks to remove, do it */ start_lblk = EXT4_B_TO_LBLK(inode, offset); end_lblk = end >> inode->i_blkbits; if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); else ret = ext4_ind_remove_space(handle, inode, start_lblk, end_lblk); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_handle; } ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, start_lblk, end_lblk); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. 
But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. 
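	 *
	 * (i.e. the i_nlink check below distinguishes the two cases: a
	 * plain ftruncate() leaves i_nlink non-zero, so the orphan record
	 * added earlier in this function is removed here.)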
*/ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. 
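		 *
		 * For example (hypothetical uid): uid 100000 == 0x186A0 is
		 * stored as i_uid_low = 0x86A0 and i_uid_high = 0x0001; an
		 * old 16-bit kernel reusing a deleted inode would otherwise
		 * inherit the stale high bits, so they are cleared for dead,
		 * non-orphan inodes below.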
*/ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. 
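 *
 * Location math, for illustration (hypothetical geometry): with
 * 256-byte inodes, a 4k block size (16 inodes per block) and 8192
 * inodes per group, ino 50 gives block_group = 49 / 8192 = 0,
 * inode_offset = 49, block = inode_table + 49 / 16 = inode_table + 3,
 * and iloc->offset = (49 % 16) * 256 = 256.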
*/ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. 
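	 *
	 * (The readahead window above is power-of-2 aligned: e.g. with
	 * s_inode_readahead_blks == 32, b = block & ~31 rounds the start
	 * down to a 32-block boundary, clamped to the inode table, and
	 * reads stop at the last initialized itable block.)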
*/ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. 
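	 *
	 * (inode_set_flags(inode, new_fl, mask) atomically replaces exactly
	 * the bits in @mask with those in @new_fl; since S_DAX is part of
	 * the mask below, omitting it from new_fl would silently clear it.)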
*/ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; err = xattr_check_inode(inode, IHDR(inode, raw_inode), ITAIL(inode, raw_inode)); if (err) return err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. 
*/ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, const char *function, unsigned int line) { const char *err_str; if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { err_str = "missing EA_INODE flag"; goto error; } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) { err_str = "ea_inode with extended attributes"; goto error; } } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { /* * open_by_handle_at() could provide an old inode number * that has since been reused for an ea_inode; this does * not indicate filesystem corruption */ if (flags & EXT4_IGET_HANDLE) return -ESTALE; err_str = "unexpected EA_INODE flag"; goto error; } } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; goto error; } return 0; error: ext4_error_inode(inode, function, line, 0, err_str); return -EFSCORRUPTED; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { ret = check_igot_inode(inode, flags, function, line); if (ret) { iput(inode); return ERR_PTR(ret); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && 
(!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. 
We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; if (inode->i_size == 0 || inode->i_size >= sizeof(ei->i_data) || strnlen((char *)ei->i_data, inode->i_size + 1) != inode->i_size) { ext4_error_inode(inode, function, line, 0, "invalid fast symlink length %llu", (unsigned long long)inode->i_size); ret = -EFSCORRUPTED; goto bad_inode; } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, 
inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, * it's just not an inode we can return for an fhandle lookup. */ if (ret == -ESTALE) { brelse(iloc.bh); unlock_new_inode(inode); iput(inode); return ERR_PTR(-ESTALE); } if (ret) goto bad_inode; brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. 
*/ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; err = ext4_emergency_state(inode->i_sb); if (unlikely(err)) return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. 
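		 *
		 * (wbc->for_sync is set when we are called on behalf of
		 * sys_sync(); in that case the global buffer flush makes a
		 * per-inode sync_dirty_buffer() call below redundant.)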
		 */
		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
			sync_dirty_buffer(iloc.bh);
		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
			ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
					       "IO error syncing inode");
			err = -EIO;
		}
		brelse(iloc.bh);
	}
	return err;
}

/*
 * In data=journal mode ext4_journalled_invalidate_folio() may fail to
 * invalidate buffers that are attached to a folio straddling i_size and
 * are undergoing commit. In that case we have to wait for commit to
 * finish and try again.
 */
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
	unsigned offset;
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
	tid_t commit_tid;
	int ret;
	bool has_transaction;

	offset = inode->i_size & (PAGE_SIZE - 1);
	/*
	 * If the folio is fully truncated, we don't need to wait for any commit
	 * (and we even should not as __ext4_journalled_invalidate_folio() may
	 * strip all buffers from the folio but keep the folio dirty which can
	 * then confuse e.g. concurrent ext4_writepages() seeing dirty folio
	 * without buffers). Also we don't need to wait for any commit if all
	 * buffers in the folio remain valid. This is most beneficial for the
	 * common case of blocksize == PAGESIZE.
	 */
	if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
		return;
	while (1) {
		struct folio *folio = filemap_lock_folio(inode->i_mapping,
				      inode->i_size >> PAGE_SHIFT);

		if (IS_ERR(folio))
			return;
		ret = __ext4_journalled_invalidate_folio(folio, offset,
						folio_size(folio) - offset);
		folio_unlock(folio);
		folio_put(folio);
		if (ret != -EBUSY)
			return;
		has_transaction = false;
		read_lock(&journal->j_state_lock);
		if (journal->j_committing_transaction) {
			commit_tid = journal->j_committing_transaction->t_tid;
			has_transaction = true;
		}
		read_unlock(&journal->j_state_lock);
		if (has_transaction)
			jbd2_log_wait_commit(journal, commit_tid);
	}
}

/*
 * ext4_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible. In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk. (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Another thing we have to ensure is that if we are in ordered mode
 * and the inode is still attached to the committing transaction, we
 * must start writeout of all the dirty pages which are being truncated.
 * This way we are sure that all the data written in the previous
 * transaction are already on disk (truncate waits for pages under
 * writeback).
 *
 * Called with inode->i_rwsem down.
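 *
 * Rough order of operations for a shrinking truncate (a sketch, not
 * the full story): ext4_begin_ordered_truncate(), inode_dio_wait(),
 * ext4_break_layouts(), update i_disksize and i_size under i_data_sem,
 * truncate_pagecache(), then ext4_truncate().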
*/ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; error = ext4_emergency_state(inode->i_sb); if (unlikely(error)) return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (attr->ia_size == inode->i_size) inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. */ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { /* attach jbd2 jinode for EOF folio tail zeroing */ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } /* * Update c/mtime and tail zero the EOF folio on * truncate up. ext4_truncate() handles the shrink case * below. */ if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (oldsize & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, inode->i_mapping, oldsize); } if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? 
attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { if (inc_ivers) inode_inc_iversion(inode); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } u32 ext4_dio_alignment(struct inode *inode) { if (fsverity_active(inode)) return 0; if (ext4_should_journal_data(inode)) return 0; if (ext4_has_inline_data(inode)) return 0; if (IS_ENCRYPTED(inode)) { if (!fscrypt_dio_supported(inode)) return 0; return i_blocksize(inode); } return 1; /* use the iomap defaults */ } int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } /* * Return the DIO alignment restrictions if requested. We only return * this information when requested, since on encrypted files it might * take a fair bit of work to get if the file wasn't opened recently. 
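	 *
	 * (e.g. an unencrypted regular file yields ext4_dio_alignment() == 1,
	 * so the block device's DMA and logical block limits are reported;
	 * a file using fs-layer fscrypt yields 0, i.e. DIO unsupported.)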
	 */
	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
		u32 dio_align = ext4_dio_alignment(inode);

		stat->result_mask |= STATX_DIOALIGN;
		if (dio_align == 1) {
			struct block_device *bdev = inode->i_sb->s_bdev;

			/* iomap defaults */
			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
			stat->dio_offset_align = bdev_logical_block_size(bdev);
		} else {
			stat->dio_mem_align = dio_align;
			stat->dio_offset_align = dio_align;
		}
	}

	if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
		unsigned int awu_min = 0, awu_max = 0;

		if (ext4_inode_can_atomic_write(inode)) {
			awu_min = sbi->s_awu_min;
			awu_max = sbi->s_awu_max;
		}

		generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
	}

	flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
	if (flags & EXT4_APPEND_FL)
		stat->attributes |= STATX_ATTR_APPEND;
	if (flags & EXT4_COMPR_FL)
		stat->attributes |= STATX_ATTR_COMPRESSED;
	if (flags & EXT4_ENCRYPT_FL)
		stat->attributes |= STATX_ATTR_ENCRYPTED;
	if (flags & EXT4_IMMUTABLE_FL)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (flags & EXT4_NODUMP_FL)
		stat->attributes |= STATX_ATTR_NODUMP;
	if (flags & EXT4_VERITY_FL)
		stat->attributes |= STATX_ATTR_VERITY;

	stat->attributes_mask |= (STATX_ATTR_APPEND |
				  STATX_ATTR_COMPRESSED |
				  STATX_ATTR_ENCRYPTED |
				  STATX_ATTR_IMMUTABLE |
				  STATX_ATTR_NODUMP |
				  STATX_ATTR_VERITY);

	generic_fillattr(idmap, request_mask, inode, stat);
	return 0;
}

int ext4_file_getattr(struct mnt_idmap *idmap,
		      const struct path *path, struct kstat *stat,
		      u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);
	u64 delalloc_blocks;

	ext4_getattr(idmap, path, stat, request_mask, query_flags);

	/*
	 * If there is inline data in the inode, the inode will normally not
	 * have data blocks allocated (it may have an external xattr block).
	 * Report at least one sector for such files, so tools like tar, rsync,
	 * others don't incorrectly think the file is completely sparse.
	 */
	if (unlikely(ext4_has_inline_data(inode)))
		stat->blocks += (stat->size + 511) >> 9;

	/*
	 * We can't update i_blocks if the block allocation is delayed,
	 * otherwise in the case of a system crash before the real block
	 * allocation is done, we would have i_blocks inconsistent with
	 * on-disk file blocks.
	 * We always keep i_blocks updated together with real
	 * allocation. But to avoid confusing userspace, stat
	 * will return the blocks that include the delayed allocation
	 * blocks for this file.
	 */
	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
				   EXT4_I(inode)->i_reserved_data_blocks);
	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
	return 0;
}

static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
				   int pextents)
{
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return ext4_ind_trans_blocks(inode, lblocks);
	return ext4_ext_index_trans_blocks(inode, pextents);
}

/*
 * Account for index blocks, block group bitmaps and block group
 * descriptor blocks if we modify data blocks and index blocks. In the
 * worst case, the index blocks spread over different block groups.
 *
 * If the data blocks are discontiguous, they may spread over different
 * block groups too. Even if they are contiguous, with flexbg they could
 * still cross a block group boundary.
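 *
 * A worked example (hypothetical numbers): for an extent-mapped inode
 * with lblocks = pextents = 4 and idxblocks = 2, groups = 2 + 4 = 6,
 * so before clamping we charge 2 index blocks, 6 bitmap blocks and 6
 * descriptor blocks, plus EXT4_META_TRANS_BLOCKS(sb) on top.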
 *
 * Also account for superblock, inode, quota and xattr blocks.
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
				  int pextents)
{
	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
	int gdpblocks;
	int idxblocks;
	int ret;

	/*
	 * How many index blocks do we need to touch to map @lblocks logical
	 * blocks to @pextents physical extents?
	 */
	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

	ret = idxblocks;

	/*
	 * Now let's see how many group bitmaps and group descriptors need
	 * to be accounted for.
	 */
	groups = idxblocks + pextents;
	gdpblocks = groups;
	if (groups > ngroups)
		groups = ngroups;
	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

	/* bitmaps and block group descriptor blocks */
	ret += groups + gdpblocks;

	/* Blocks for super block, inode, quota and xattr blocks */
	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

	return ret;
}

/*
 * Calculate the total number of credits to reserve to fit
 * the modification of a single page into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin()
 *
 * We need to consider the worst case, when
 * one new block is allocated per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
	int bpp = ext4_journal_blocks_per_page(inode);
	int ret;

	ret = ext4_meta_trans_blocks(inode, bpp, bpp);

	/* Account for data blocks for journalled mode */
	if (ext4_should_journal_data(inode))
		ret += bpp;
	return ret;
}

/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calls
 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
	return ext4_meta_trans_blocks(inode, nrblocks, 1);
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 */
int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode,
			 struct ext4_iloc *iloc)
{
	int err = 0;

	err = ext4_emergency_state(inode->i_sb);
	if (unlikely(err)) {
		put_bh(iloc->bh);
		return err;
	}
	ext4_fc_track_inode(handle, inode);

	/* the do_update_inode consumes one bh->b_count */
	get_bh(iloc->bh);

	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
	err = ext4_do_update_inode(handle, inode, iloc);
	put_bh(iloc->bh);
	return err;
}

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh. This _must_ be cleaned up later.
 */
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
			     struct ext4_iloc *iloc)
{
	int err;

	err = ext4_emergency_state(inode->i_sb);
	if (unlikely(err))
		return err;

	err = ext4_get_inode_loc(inode, iloc);
	if (!err) {
		BUFFER_TRACE(iloc->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, inode->i_sb,
						    iloc->bh, EXT4_JTR_NONE);
		if (err) {
			brelse(iloc->bh);
			iloc->bh = NULL;
		}
	}
	ext4_std_error(inode->i_sb, err);
	return err;
}

static int __ext4_expand_extra_isize(struct inode *inode,
				     unsigned int new_extra_isize,
				     struct ext4_iloc *iloc,
				     handle_t *handle, int *no_expand)
{
	struct ext4_inode *raw_inode;
	struct ext4_xattr_ibody_header *header;
	unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);
	int error;

	/* this was checked at iget time, but double check for good measure */
	if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
	    (ei->i_extra_isize & 3)) {
		EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
				 ei->i_extra_isize,
				 EXT4_INODE_SIZE(inode->i_sb));
		return -EFSCORRUPTED;
	}
	if ((new_extra_isize < ei->i_extra_isize) ||
	    (new_extra_isize < 4) ||
	    (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
		return -EINVAL;	/* Should never happen */

	raw_inode = ext4_raw_inode(iloc);

	header = IHDR(inode, raw_inode);

	/* No extended attributes present */
	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
		       EXT4_I(inode)->i_extra_isize, 0,
		       new_extra_isize - EXT4_I(inode)->i_extra_isize);
		EXT4_I(inode)->i_extra_isize = new_extra_isize;
		return 0;
	}

	/*
	 * We may need to allocate an external xattr block so we need quotas
	 * initialized. Here we can be called with various locks held so we
	 * cannot afford to initialize quotas ourselves. So just bail.
	 */
	if (dquot_initialize_needed(inode))
		return -EAGAIN;

	/* try to expand with EAs present */
	error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
					   raw_inode, handle);
	if (error) {
		/*
		 * Inode size expansion failed; don't try again
		 */
		*no_expand = 1;
	}

	return error;
}

/*
 * Expand an inode by new_extra_isize bytes.
 * Returns 0 on success or negative error number on failure.
 */
static int ext4_try_to_expand_extra_isize(struct inode *inode,
					  unsigned int new_extra_isize,
					  struct ext4_iloc iloc,
					  handle_t *handle)
{
	int no_expand;
	int error;

	if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
		return -EOVERFLOW;

	/*
	 * In nojournal mode, we can immediately attempt to expand
	 * the inode. When journaled, we first need to obtain extra
	 * buffer credits since we may write into the EA block
	 * with this same handle. If journal_extend fails, then it will
	 * only result in a minor loss of functionality for that inode.
	 * If this is felt to be critical, then e2fsck should be run to
	 * force a large enough s_min_extra_isize.
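	 *
	 * (ext4_journal_extend() returns 0 only when the handle could be
	 * extended by the requested credits; any other result makes us
	 * bail out with -ENOSPC below rather than risk exhausting the
	 * handle mid-update.)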
*/ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; int alloc_ctx; /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. 
If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ if (val) { filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } } alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... 
*/ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (folio_buffers(folio)) { if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; } /* * Data journalling can't use block_page_mkwrite() because it * will set_buffer_dirty() before do_journal_get_write_access() * thus might hit warning messages for dirty metadata buffers. */ if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); err = ext4_block_write_begin(handle, folio, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); } } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: folio_unlock(folio); ext4_journal_stop(handle); goto out; }
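/*
 * Editor's sketch (not part of fs/ext4/inode.c): how the credit helpers
 * above are typically consumed. A caller sizes a transaction handle with
 * ext4_chunk_trans_blocks() before mapping a contiguous range with
 * ext4_map_blocks(). The function name ext4_map_range_sketch() and the
 * elided mapping step are illustrative assumptions, not ext4 code.
 */
static int ext4_map_range_sketch(struct inode *inode, int nrblocks)
{
	handle_t *handle;
	int credits, err;

	/* metadata credits for one contiguous chunk of @nrblocks blocks */
	credits = ext4_chunk_trans_blocks(inode, nrblocks);
	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... ext4_map_blocks() would run under @handle here ... */
	err = ext4_journal_stop(handle);
	return err;
}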
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ #include <linux/cred.h> #include <linux/fs.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include "internal.h" /* * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, * never from raw values. These are just internal helpers. */ #define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } #define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } struct mnt_idmap { struct uid_gid_map uid_map; struct uid_gid_map gid_map; refcount_t count; }; /* * Carries the initial idmapping of 0:0:4294967295 which is an identity * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /* * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range. * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID. */ struct mnt_idmap invalid_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(invalid_mnt_idmap); /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, * [...], 1000 to 1000 [...]. * * Return: true if this is the initial mapping, false if not. */ static inline bool initial_idmapping(const struct user_namespace *ns) { return ns == &init_user_ns; } /** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kuid : kuid to be mapped * * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kuid unchanged.
* If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() * directly. * * Return: @kuid mapped according to @idmap. * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is * returned. */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); /** * make_vfsgid - map a filesystem kgid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped * * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() * directly. * * Return: @kgid mapped according to @idmap. * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is * returned. */ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid : vfsuid to be mapped * * Map @vfsuid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsuid to inode->i_uid. * * Return: @vfsuid mapped into the filesystem idmapping */ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); if (idmap == &invalid_mnt_idmap) return INVALID_UID; uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) return KUIDT_INIT(uid); return make_kuid(fs_userns, uid); } EXPORT_SYMBOL_GPL(from_vfsuid); /** * from_vfsgid - map a vfsgid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid : vfsgid to be mapped * * Map @vfsgid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsgid to inode->i_gid. 
* * Return: @vfsgid mapped into the filesystem idmapping */ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); if (idmap == &invalid_mnt_idmap) return INVALID_GID; gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) return KGIDT_INIT(gid); return make_kgid(fs_userns, gid); } EXPORT_SYMBOL_GPL(from_vfsgid); #ifdef CONFIG_MULTIUSER /** * vfsgid_in_group_p() - check whether a vfsgid matches the caller's groups * @vfsgid: the mnt gid to match * * This function can be used to determine whether @vfsgid matches any of the * caller's groups. * * Return: 1 if @vfsgid matches the caller's groups, 0 if not. */ int vfsgid_in_group_p(vfsgid_t vfsgid) { return in_group_p(AS_KGIDT(vfsgid)); } #else int vfsgid_in_group_p(vfsgid_t vfsgid) { return 1; } #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); static int copy_mnt_idmap(struct uid_gid_map *map_from, struct uid_gid_map *map_to) { struct uid_gid_extent *forward, *reverse; u32 nr_extents = READ_ONCE(map_from->nr_extents); /* Pairs with smp_wmb() when writing the idmapping. */ smp_rmb(); /* * Don't blindly copy @map_from into @map_to: if @nr_extents is zero * no idmapping has been written yet. Since we only read @nr_extents * once, a concurrent writer could otherwise leave us with * inconsistent data. So just don't do anything at all. */ if (nr_extents == 0) return -EINVAL; /* * Here we know that nr_extents is greater than zero which means * a map has been written. Since idmappings can't be changed * once they have been written we know that we can safely copy * from @map_from into @map_to. */ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { *map_to = *map_from; return 0; } forward = kmemdup_array(map_from->forward, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; reverse = kmemdup_array(map_from->reverse, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; } /* * The idmapping isn't exposed anywhere so we don't need to care * about ordering between extent pointers and @nr_extents * initialization. */ map_to->forward = forward; map_to->reverse = reverse; map_to->nr_extents = nr_extents; return 0; } static void free_mnt_idmap(struct mnt_idmap *idmap) { if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->uid_map.forward); kfree(idmap->uid_map.reverse); } if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->gid_map.forward); kfree(idmap->gid_map.reverse); } kfree(idmap); } struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); refcount_set(&idmap->count, 1); ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); if (!ret) ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); if (ret) { free_mnt_idmap(idmap); idmap = ERR_PTR(ret); } return idmap; } /** * mnt_idmap_get - get a reference to an idmapping * @idmap: the idmap to bump the reference on * * If @idmap is not the @nop_mnt_idmap bump the reference count. * * Return: @idmap with reference count bumped if @nop_mnt_idmap isn't passed.
*/ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap) refcount_inc(&idmap->count); return idmap; } EXPORT_SYMBOL_GPL(mnt_idmap_get); /** * mnt_idmap_put - put a reference to an idmapping * @idmap: the idmap to put the reference on * * If this is a non-initial idmapping, put the reference count when a mount is * released and free it if we're the last user. */ void mnt_idmap_put(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap && refcount_dec_and_test(&idmap->count)) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) { struct uid_gid_map *map, *map_up; u32 idx, nr_mappings; if (!is_valid_mnt_idmap(idmap)) return 0; /* * Idmappings are shown relative to the caller's idmapping. * This is both the most intuitive and most useful solution. */ if (uid_map) { map = &idmap->uid_map; map_up = &current_user_ns()->uid_map; } else { map = &idmap->gid_map; map_up = &current_user_ns()->gid_map; } for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { uid_t lower; struct uid_gid_extent *extent; if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = &map->extent[idx]; else extent = &map->forward[idx]; /* * Verify that the whole range of the mapping can be * resolved in the caller's idmapping. If it cannot be * resolved skip the mapping. */ lower = map_id_range_up(map_up, extent->lower_first, extent->count); if (lower == (uid_t) -1) continue; seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); seq->count++; /* mappings are separated by \0 */ if (seq_has_overflowed(seq)) return -EAGAIN; nr_mappings++; } return nr_mappings; }
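/*
 * Editor's sketch (not part of fs/mnt_idmapping.c): the usual round trip
 * through the helpers above. A kuid stored in the filesystem is mapped up
 * with make_vfsuid() before being reported to userspace, and a vfsuid is
 * mapped back down with from_vfsuid() before being written to
 * inode->i_uid. The function name demo_uid_roundtrip() is hypothetical.
 */
static bool demo_uid_roundtrip(struct mnt_idmap *idmap,
			       struct user_namespace *fs_userns,
			       struct inode *inode)
{
	/* map the filesystem kuid up into the mount's idmapping */
	vfsuid_t vfsuid = make_vfsuid(idmap, fs_userns, inode->i_uid);

	if (!vfsuid_valid(vfsuid))
		return false;
	/* map it back down; for a valid mapping this recovers the original kuid */
	return uid_eq(from_vfsuid(idmap, fs_userns, vfsuid), inode->i_uid);
}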
// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus */ /* * Ok, demand-loading was easy, shared pages a little bit trickier. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. */ /* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. */ /* * 05.04.94 - Multi-page memory management added for v1.1.
* Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/kmsan.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "pgalloc-track.h" #include "internal.h" #include "swap.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); /* * Return true if the original pte was a uffd-wp pte marker (so the pte was * wr-protected). */ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) { if (!userfaultfd_wp(vmf->vma)) return false; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; return pte_marker_uffd_wp(vmf->orig_pte); } /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { /* * Transitioning a PTE from 'old' to 'young' can be expensive on * some architectures, even if it's performed in hardware. By * default, "false" means prefaulted entries will be 'young'. */ return false; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member) { trace_rss_stat(mm, member); } /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
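 *
 * free_pte_range() below is the leaf step of the page-table freeing
 * walk: free_pgd_range() descends through free_p4d_range(),
 * free_pud_range() and free_pmd_range(), and each level clears its
 * entry and frees the child table once the covered range is empty.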
*/ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /* * This function frees user-level page tables of a process. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. 
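 *
 * A small worked example (illustrative numbers, assuming a 2MB
 * PMD_SIZE): with floor == 0x300000 and addr == 0x2f0000, the
 * "addr &= PMD_MASK" below gives 0x200000, which is under floor, so
 * addr is bumped by PMD_SIZE to 0x400000 - the table spanning
 * [0x200000, 0x400000) straddles floor and may still be in use by a
 * neighbouring vma, so it must not be freed. Likewise a ceiling that
 * masks down to 0 would be indistinguishable from "ceiling 0 == top
 * of the address space", so we bail out instead.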
*/ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { struct unlink_vma_file_batch vb; do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; /* * Note: USER_PGTABLES_CEILING may be passed as ceiling and may * be 0. This will underflow and is okay. */ next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); if (is_vm_hugetlb_page(vma)) { unlink_file_vma(vma); hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } else { unlink_file_vma_batch_init(&vb); unlink_file_vma_batch_add(&vb, vma); /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = mas_find(mas, ceiling - 1); if (unlikely(xa_is_zero(next))) next = NULL; if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); unlink_file_vma_batch_add(&vb, vma); } unlink_file_vma_batch_final(&vb); free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } vma = next; } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) { spinlock_t *ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd) { pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? 
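	 * (a racing caller may have installed its own pte table
	 * between our allocation above and this check; if so, the
	 * page we allocated is freed again below)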
*/ smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } /* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). 
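 *
 * As a worked example (illustrative numbers): a VM_PFNMAP vma with
 * vm_pgoff == 0x100 must map vm_start to pfn 0x100, vm_start +
 * PAGE_SIZE to pfn 0x101, and so on. Any pte whose pfn falls off that
 * line can only be a COWed replacement page, and is therefore normal.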
* * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The only exception are zeropages, which are * *never* refcounted. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) /* * NOTE: New users of ZONE_DEVICE will not set pte_devmap() * and will have refcounts incremented on their struct pages * when they are inserted into PTEs, thus they are safe to * return here. Legacy ZONE_DEVICE pages that set pte_devmap() * do not have refcounts. Example of legacy ZONE_DEVICE is * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. */ return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; if (is_zero_pfn(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: VM_WARN_ON_ONCE(is_zero_pfn(pfn)); return pfn_to_page(pfn); } struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct page *page = vm_normal_page(vma, addr, pte); if (page) return page_folio(page); return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); /* Currently it's only used for huge pfnmaps */ if (unlikely(pmd_special(pmd))) return NULL; if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (pmd_devmap(pmd)) return NULL; if (is_huge_zero_pmd(pmd)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. 
*/ out: return pfn_to_page(pfn); } struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { struct page *page = vm_normal_page_pmd(vma, addr, pmd); if (page) return page_folio(page); return NULL; } #endif /** * restore_exclusive_pte - Restore a device-exclusive entry * @vma: VMA covering @address * @folio: the mapped folio * @page: the mapped folio page * @address: the virtual address * @ptep: pte pointer into the locked page table mapping the folio page * @orig_pte: pte value at @ptep * * Restore a device-exclusive non-swap entry to an ordinary present pte. * * The folio and the page table must be locked, and MMU notifiers must have * been called to invalidate any (exclusive) device mappings. * * Locking the folio makes sure that anybody who just converted the pte to * a device-exclusive entry can map it into the device to make forward * progress without others converting it back until the folio was unlocked. * * If the folio lock ever becomes an issue, we can stop relying on the folio * lock; it might make some scenarios with heavy thrashing less likely to * make forward progress, but these scenarios might not be valid use cases. * * Note that the folio lock does not protect against all cases of concurrent * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers * must use MMU notifiers to sync against any concurrent changes. */ static void restore_exclusive_pte(struct vm_area_struct *vma, struct folio *folio, struct page *page, unsigned long address, pte_t *ptep, pte_t orig_pte) { pte_t pte; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); if ((vma->vm_flags & VM_WRITE) && can_change_pte_writable(vma, address, pte)) { if (folio_test_dirty(folio)) pte = pte_mkdirty(pte); pte = pte_mkwrite(pte, vma); } set_pte_at(vma->vm_mm, address, ptep, pte); /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. */ update_mmu_cache(vma, address, ptep); } /* * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ static int try_restore_exclusive_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t orig_pte) { struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte)); struct folio *folio = page_folio(page); if (folio_trylock(folio)) { restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); folio_unlock(folio); return 0; } return -EBUSY; } /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; swp_entry_t entry = pte_to_swp_entry(orig_pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. 
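	 * The list_empty() test is deliberately made twice - once
	 * without and once under mmlist_lock - so that the common
	 * case does not take the lock at all.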
*/ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } /* Mark the swap entry as shared. */ if (pte_swp_exclusive(orig_pte)) { pte = pte_swp_clear_exclusive(orig_pte); set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { folio = pfn_swap_entry_folio(entry); rss[mm_counter(folio)]++; if (!is_readable_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child * to be set to read. A previously exclusive entry is * now shared. */ entry = make_readable_migration_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = pfn_swap_entry_to_page(entry); folio = page_folio(page); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (is_writable_device_private_entry(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_exclusive_entry(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device * exclusive entries currently only support private writable * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) set_pte_at(dst_mm, addr, dst_pte, make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } /* * Copy a present and normal page. * * NOTE! The usual case is that this isn't required; * instead, the caller can just increase the page refcount * and re-use the pte the traditional way. * * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation * code know so that it can do so outside the page table * lock. */ static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc, struct page *page) { struct folio *new_folio; pte_t pte; new_folio = *prealloc; if (!new_folio) return -EAGAIN; /* * We have a prealloc page, all good! Take it * over and copy the page & arm it. 
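	 *
	 * (copy_mc_user_highpage() is the machine-check-safe copy;
	 * it only fails when the source page is hw-poisoned, which
	 * is reported as -EHWPOISON)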
*/ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) return -EHWPOISON; *prealloc = NULL; __folio_mark_uptodate(new_folio); folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_mkuffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } /* If it's a shared mapping, mark it clean in the child. */ if (src_vma->vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } /* * Copy one present PTE, trying to batch-process subsequent PTEs that map * consecutive pages of the same folio by copying them as well. * * Returns -EAGAIN if one preallocated page is required to copy the next PTE. * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; bool any_writable; fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); /* * If we likely have to copy, just don't bother with batching. Make * sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { if (src_vma->vm_flags & VM_SHARED) flags |= FPB_IGNORE_DIRTY; if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } if (any_writable) pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; } folio_get(folio); if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); return err ? 
err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } copy_pte: __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, unsigned long addr, bool need_zero) { struct folio *new_folio; if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { folio_put(new_folio); return NULL; } folio_throttle_swaprate(new_folio, GFP_KERNEL); return new_folio; } static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; int nr; again: progress = 0; init_rss_vec(rss); /* * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the * error handling here, assume that exclusive mmap_lock on dst and src * protects anon from unexpected THP transitions; with shmem and file * protected by mmap_lock-less collapse skipping areas with anon_vma * (whereas vma_needs_copy() skips areas without anon_vma). A rework * can remove such assumptions later, but this is good enough for now. */ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } /* * We already hold the exclusive mmap_lock, the copy_pte_range() and * retract_page_tables() are using vma->anon_vma to be exclusive, so * the PTE page is stable, and there is no need to get pmdval and do * pmd_same() check. */ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ goto out; } spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { nr = 1; /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } ptent = ptep_get(src_pte); if (pte_none(ptent)) { progress++; continue; } if (unlikely(!pte_present(ptent))) { ret = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (ret == -EIO) { entry = pte_to_swp_entry(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; } else if (!ret) { progress += 8; continue; } ptent = ptep_get(src_pte); VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying * the now present pte. */ WARN_ON_ONCE(ret != -ENOENT); } /* copy_present_ptes() will clear `*prealloc' if consumed */ max_nr = (end - addr) / PAGE_SIZE; ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. * If copy failed due to hwpoison in source page, break out. 
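	 *
	 * The contract with copy_present_ptes() is: a positive return
	 * is the number of ptes copied in this step, -EAGAIN means
	 * "drop the locks and preallocate a destination page", and
	 * -EHWPOISON aborts the copy.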
*/ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* * pre-alloc page cannot be reused by next time so as * to strictly follow mempolicy (e.g., alloc_page_vma() * will allocate page according to address). This * could only happen if one pinned pte changed. */ folio_put(prealloc); prealloc = NULL; } nr = ret; progress += 8 * nr; } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; } else if (ret < 0) { VM_WARN_ON_ONCE(1); } /* We've captured and resolved the error. Reset, try again. */ ret = 0; if (addr != end) goto again; out: if (unlikely(prealloc)) folio_put(prealloc); return ret; } static inline int copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, addr, next)) return -ENOMEM; } while 
(dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } /* * Return true if the vma needs to copy the pgtable during this fork(). Return * false when we can speed up fork() by allowing lazy page faults later until * when the child accesses the memory range. */ static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { /* * Always copy pgtables when dst_vma has uffd-wp enabled even if it's * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable * contains uffd-wp protection information, that's something we can't * retrieve from page cache, and skip copying will lose those info. */ if (userfaultfd_wp(dst_vma)) return true; if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return true; if (src_vma->anon_vma) return true; /* * Don't copy ptes where a page fault will fill them correctly. Fork * becomes much lighter when there are big shared or private readonly * mappings. The tradeoff is that copy_page_range is more efficient * than faulting. */ return false; } int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; unsigned long next, pfn = 0; bool is_cow; int ret; if (!vma_needs_copy(dst_vma, src_vma)) return 0; if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { ret = track_pfn_copy(dst_vma, src_vma, &pfn); if (ret) return ret; } /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as * the read side doesn't spin, but goes to the mmap_lock. * * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) { raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) untrack_pfn_copy(dst_vma, pfn); return ret; } /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ return details->even_cows; } /* Decides whether we should zap this folio with the folio pointer specified */ static inline bool should_zap_folio(struct zap_details *details, struct folio *folio) { /* If we can make a decision without *folio.. 
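 *
 * (a NULL @details, as most callers pass, means "zap everything";
 * see should_zap_cows() above)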
*/ if (should_zap_cows(details)) return true; /* Otherwise we should only zap non-anon folios */ return !folio_test_anon(folio); } static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; return details->zap_flags & ZAP_FLAG_DROP_MARKER; } /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. * * Returns true if uffd-wp ptes was installed, false otherwise. */ static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { bool was_installed = false; #ifdef CONFIG_PTE_MARKER_UFFD_WP /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) return false; if (zap_drop_markers(details)) return false; for (;;) { /* the PFN in the PTE is irrelevant. */ if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } #endif return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; if (!folio_test_anon(folio)) { ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); if (pte_dirty(ptent)) { folio_mark_dirty(folio); if (tlb_delay_rmap(tlb)) { delay_rmap = true; *force_flush = true; } } if (pte_young(ptent) && likely(vma_has_recency(vma))) folio_mark_accessed(folio); rss[mm_counter(folio)] -= nr; } else { /* We don't need up-to-date accessed/dirty bits. */ clear_full_ptes(mm, addr, pte, nr, tlb->fullmm); rss[MM_ANONPAGES] -= nr; } /* Checking a single PTE in a batch is sufficient. */ arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; } } /* * Zap or skip at least one present PTE, trying to batch-process subsequent * PTEs that map consecutive pages of the same folio. * * Returns the number of processed (skipped or zapped) PTEs (at least 1). */ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; int nr; page = vm_normal_page(vma, addr, ptent); if (!page) { /* We don't need up-to-date accessed/dirty bits. 
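		 *
		 * (vm_normal_page() returned NULL: this pte maps no
		 * struct page - e.g. a special PFN mapping or the
		 * zeropage - so there is no folio state to transfer
		 * before clearing the entry)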
*/ ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) { *any_skipped = true; return 1; } /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, details, rss, force_flush, force_break, any_skipped); return 1; } static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *any_skipped) { swp_entry_t entry; int nr = 1; *any_skipped = true; entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry) || is_device_exclusive_entry(entry)) { struct page *page = pfn_swap_entry_to_page(entry); struct folio *folio = page_folio(page); if (unlikely(!should_zap_folio(details, folio))) return 1; /* * Both device private/exclusive mappings should only * work with anonymous page so far, so we don't need to * consider uffd-wp bit when zap. For more information, * see zap_install_uffd_wp_if_needed(). */ WARN_ON_ONCE(!vma_is_anonymous(vma)); rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); } else if (!non_swap_entry(entry)) { /* Genuine swap entries, hence a private anon pages */ if (!should_zap_cows(details)) return 1; nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; free_swap_and_cache_nr(entry, nr); } else if (is_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry); if (!should_zap_folio(details, folio)) return 1; rss[mm_counter(folio)]--; } else if (pte_marker_entry_uffd_wp(entry)) { /* * For anon: always drop the marker; for file: only * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) return 1; } else if (is_guard_swp_entry(entry)) { /* * Ordinary zapping should not remove guard PTE * markers. Only do so if we should remove PTE markers * in general. 
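		 *
		 * (guard markers are installed with
		 * madvise(MADV_GUARD_INSTALL) and are expected to be
		 * removed with MADV_GUARD_REMOVE, not by a plain zap)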
*/ if (!zap_drop_markers(details)) return 1; } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) return 1; } else { /* We should have covered all the swap entry types */ pr_alert("unrecognized swap entry 0x%lx\n", entry.val); WARN_ON_ONCE(1); } clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); return nr; } static inline int do_zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, unsigned long addr, unsigned long end, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { pte_t ptent = ptep_get(pte); int max_nr = (end - addr) / PAGE_SIZE; int nr = 0; /* Skip all consecutive none ptes */ if (pte_none(ptent)) { for (nr = 1; nr < max_nr; nr++) { ptent = ptep_get(pte + nr); if (!pte_none(ptent)) break; } max_nr -= nr; if (!max_nr) return nr; pte += nr; addr += nr * PAGE_SIZE; } if (pte_present(ptent)) nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, force_flush, force_break, any_skipped); else nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, details, rss, any_skipped); return nr; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { bool force_flush = false, force_break = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; pmd_t pmdval; unsigned long start = addr; bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); bool direct_reclaim = true; int nr; retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!pte) return addr; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { bool any_skipped = false; if (need_resched()) { direct_reclaim = false; break; } nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); if (any_skipped) can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; direct_reclaim = false; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); /* * Fast path: try to hold the pmd lock and unmap the PTE page. * * If the pte lock was released midway (retry case), or if the attempt * to hold the pmd lock failed, then we need to recheck all pte entries * to ensure they are still none, thereby preventing the pte entries * from being repopulated by another thread. */ if (can_reclaim_pt && direct_reclaim && addr == end) direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_rmaps(tlb, vma); } pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Come back again if we didn't do everything. 
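	 *
	 * ("come back" means the retry: label above - force_flush and
	 * force_break are reset and the pte lock is reacquired at the
	 * updated addr)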
*/ if (force_flush) tlb_flush_mmu(tlb); if (addr != end) { cond_resched(); force_flush = false; force_break = false; goto retry; } if (can_reclaim_pt) { if (direct_reclaim) free_pte(mm, start, tlb, pmdval); else try_to_free_pte(mm, pmd, start, tlb); } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; } /* fall through */ } else if (details && details->single_folio && folio_test_pmd_mappable(details->single_folio) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* * Take and drop THP pmd lock so that we cannot return * prematurely, while zap_huge_pmd() has cleared *pmd, * but not yet decremented compound_mapcount(). */ spin_unlock(ptl); } if (pmd_none(*pmd)) { addr = next; continue; } addr = zap_pte_range(tlb, vma, pmd, addr, next, details); if (addr != next) pmd--; } while (pmd++, cond_resched(), addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_file) uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb 
area. * However, vm_file will be NULL in the error * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags); } } else unmap_page_range(tlb, vma, start, end, details); } } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @mas: the maple state * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long tree_end, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, /* Careful - we need to zap private pages too! */ .even_cows = true, }; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); mmu_notifier_invalidate_range_end(&range); } /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of shared cache invalidation * * The range must fit into one VMA. */ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); hugetlb_zap_end(vma, details); } /** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. 
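 *
 * A typical (illustrative) use is a VM_PFNMAP driver revoking a
 * mapping it set up earlier with remap_pfn_range():
 *
 *	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);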
* */ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (!range_in_vma(vma, address, address + size) || !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); } EXPORT_SYMBOL_GPL(zap_vma_ptes); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); if (!pmd) return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma) { VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); /* * Whoever wants to forbid the zeropage after some zeropages * might already have been mapped has to scan the page tables and * bail out on any zeropages. Zeropages in COW mappings can * be unshared using FAULT_FLAG_UNSHARE faults. */ if (mm_forbids_zeropage(vma->vm_mm)) return false; /* zeropages in COW mappings are common and unproblematic. */ if (is_cow_mapping(vma->vm_flags)) return true; /* Mappings that do not allow for writable PTEs are unproblematic. */ if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) return true; /* * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could * find the shared zeropage and longterm-pin it, which would * be problematic as soon as the zeropage gets replaced by a different * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would * now differ to what GUP looked up. FSDAX is incompatible to * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see * check_vma_flags). */ return vma->vm_ops && vma->vm_ops->pfn_mkwrite && (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); } static int validate_page_before_insert(struct vm_area_struct *vma, struct page *page) { struct folio *folio = page_folio(page); if (!folio_ref_count(folio)) return -EINVAL; if (unlikely(is_zero_folio(folio))) { if (!vm_mixed_zeropage_allowed(vma)) return -EINVAL; return 0; } if (folio_test_anon(folio) || folio_test_slab(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); pte_t pteval = ptep_get(pte); if (!pte_none(pteval)) { if (!mkwrite) return -EBUSY; /* see insert_pfn(). */ if (pte_pfn(pteval) != page_to_pfn(page)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); return -EFAULT; } pteval = maybe_mkwrite(pteval, vma); pteval = pte_mkyoung(pteval); if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) update_mmu_cache(vma, addr, pte); return 0; } /* Ok, finally just insert the thing.. 
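	 * The shared zeropage is mapped pte_mkspecial() and never
	 * takes a reference; every other page gets a folio reference
	 * plus file rmap accounting.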
*/ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); pteval = mk_pte(page, prot); if (mkwrite) { pteval = pte_mkyoung(pteval); pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); } inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } set_pte_at(vma->vm_mm, addr, pte, pteval); return 0; } static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; spinlock_t *ptl; retval = validate_page_before_insert(vma, page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; } static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; err = validate_page_before_insert(vma, page); if (err) return err; return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; pte_t *start_pte, *pte; spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; unsigned long pages_to_write_in_pmd; int ret; more: ret = -EFAULT; pmd = walk_to_pmd(mm, addr); if (!pmd) goto out; pages_to_write_in_pmd = min_t(unsigned long, remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); /* Allocate the PTE if necessary; takes PMD lock once only. */ ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); if (!start_pte) { ret = -EFAULT; goto out; } for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; } addr += PAGE_SIZE; ++curr_page_idx; } pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } if (remaining_pages_total) goto more; ret = 0; out: *num = remaining_pages_total; return ret; } /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. * @vma: user vma to map to * @addr: target start user address of these pages * @pages: source kernel pages * @num: in: number of pages to map. out: number of pages that were *not* * mapped. (0 means all pages were successfully mapped). * * Preferred over vm_insert_page() when inserting multiple pages. * * In case of error, we may have mapped a subset of the provided * pages. It is the caller's responsibility to account for this case. * * The same restrictions apply as in vm_insert_page(). 
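 *
 * Illustrative use (hypothetical driver code):
 *
 *	unsigned long num = nr_pages;
 *	int err = vm_insert_pages(vma, vma->vm_start, pages, &num);
 *	if (err)
 *		pr_warn("%lu pages were left unmapped\n", num);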
/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma. The zeropage is supported in some VMAs,
 * see vm_mixed_zeropage_allowed().
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		BUG_ON(mmap_read_trylock(vma->vm_mm));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vm_flags_set(vma, VM_MIXEDMAP);
	}
	return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps a range of kernel pages into a user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map a range of kernel pages into a user vma.
 * The zeropage is supported in some VMAs, see
 * vm_mixed_zeropage_allowed().
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num, unsigned long offset)
{
	unsigned long count = vma_pages(vma);
	unsigned long uaddr = vma->vm_start;
	int ret, i;

	/* Fail if the user requested offset is beyond the end of the object */
	if (offset >= num)
		return -ENXIO;

	/* Fail if the user requested size exceeds available object size */
	if (count > num - offset)
		return -ENXIO;

	for (i = 0; i < count; i++) {
		ret = vm_insert_page(vma, uaddr, pages[offset + i]);
		if (ret < 0)
			return ret;
		uaddr += PAGE_SIZE;
	}

	return 0;
}

/**
 * vm_map_pages - map a range of kernel pages starting at a non-zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff.
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present. Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);
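/*
 * A minimal usage sketch (hypothetical capture driver: my_queue and its
 * fields are assumptions, not part of this file): vm_map_pages() derives
 * the starting page from vma->vm_pgoff, so a driver that exposes several
 * buffers at distinct mmap() offsets only has to pass its page array and
 * total page count.
 *
 *	static int my_queue_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_queue *q = file->private_data;
 *
 *		return vm_map_pages(vma, q->pages, q->nr_pages);
 *	}
 *
 * A driver that never set up vm_pgoff handling would call
 * vm_map_pages_zero() instead, mapping from the start of the array.
 */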
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			pfn_t pfn, pgprot_t prot, bool mkwrite)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, entry;
	spinlock_t *ptl;

	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return VM_FAULT_OOM;
	entry = ptep_get(pte);
	if (!pte_none(entry)) {
		if (mkwrite) {
			/*
			 * For read faults on private mappings the PFN passed
			 * in may not match the PFN we have mapped if the
			 * mapped PFN is a writeable COW page.  In the mkwrite
			 * case we are creating a writable PTE for a shared
			 * mapping and we expect the PFNs to match. If they
			 * don't match, we are likely racing with block
			 * allocation and mapping invalidation so just skip the
			 * update.
			 */
			if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
				goto out_unlock;
			}
			entry = pte_mkyoung(entry);
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
				update_mmu_cache(vma, addr, pte);
		}
		goto out_unlock;
	}

	/* Ok, finally just insert the thing.. */
	if (pfn_t_devmap(pfn))
		entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
	else
		entry = pte_mkspecial(pfn_t_pte(pfn, prot));

	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}

	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple VMAs is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * pgprot typically only differs from @vma->vm_page_prot when drivers set
 * caching- and encryption bits different from those of @vma->vm_page_prot,
 * because the caching- or encryption mode may not be known at mmap() time.
 *
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 *
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t pgprot)
{
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
	 */
	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
						(VM_PFNMAP|VM_MIXEDMAP));
	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	if (!pfn_modify_allowed(pfn, pgprot))
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));

	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
			false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);
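/*
 * A minimal usage sketch (hypothetical my_dev driver: my_dev and its
 * base_pfn are assumptions, not part of this file): a ->fault handler for
 * a VM_PFNMAP vma backed by device memory, returning the vm_fault_t from
 * vmf_insert_pfn() as the fault result, as the kernel-doc above suggests.
 *
 *	static vm_fault_t my_dev_fault(struct vm_fault *vmf)
 *	{
 *		struct my_dev *dev = vmf->vma->vm_private_data;
 *
 *		return vmf_insert_pfn(vmf->vma, vmf->address,
 *				      dev->base_pfn + vmf->pgoff);
 *	}
 *
 * A driver that must decide caching per page would call
 * vmf_insert_pfn_prot() with, for example,
 * pgprot_writecombine(vmf->vma->vm_page_prot) instead.
 */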
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
{
	if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
	    (mkwrite || !vm_mixed_zeropage_allowed(vma)))
		return false;
	/* these checks mirror the abort conditions in vm_normal_page */
	if (vma->vm_flags & VM_MIXEDMAP)
		return true;
	if (pfn_t_devmap(pfn))
		return true;
	if (pfn_t_special(pfn))
		return true;
	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
		return true;
	return false;
}

static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn, bool mkwrite)
{
	pgprot_t pgprot = vma->vm_page_prot;
	int err;

	if (!vm_mixed_ok(vma, pfn, mkwrite))
		return VM_FAULT_SIGBUS;

	if (addr < vma->vm_start || addr >= vma->vm_end)
		return VM_FAULT_SIGBUS;

	track_pfn_insert(vma, &pgprot, pfn);

	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
		return VM_FAULT_SIGBUS;

	/*
	 * If we don't have pte special, then we have to use the pfn_valid()
	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
	 * refcount the page if pfn_valid is true (hence insert_page rather
	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
	 * without pte special, it would then be refcounted as a normal page.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
	    !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
		struct page *page;

		/*
		 * At this point we are committed to insert_page()
		 * regardless of whether the caller specified flags that
		 * result in pfn_t_has_page() == false.
		 */
		page = pfn_to_page(pfn_t_to_pfn(pfn));
		err = insert_page(vma, addr, page, pgprot, mkwrite);
	} else {
		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
	}

	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
			bool write)
{
	pgprot_t pgprot = vmf->vma->vm_page_prot;
	unsigned long addr = vmf->address;
	int err;

	if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	err = insert_page(vmf->vma, addr, page, pgprot, write);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 * If the insertion of the PTE failed because someone else already added a
 * different entry in the meantime, we treat that as success as we assume
 * the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
		unsigned long addr, pfn_t pfn)
{
	return __vm_insert_mixed(vma, addr, pfn, true);
}
/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;
	int err = 0;

	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	arch_enter_lazy_mmu_mode();
	do {
		BUG_ON(!pte_none(ptep_get(pte)));
		if (!pfn_modify_allowed(pfn, prot)) {
			err = -EACCES;
			break;
		}
		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(mapped_pte, ptl);
	return err;
}

static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pmd_t *pmd;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	do {
		next = pmd_addr_end(addr, end);
		err = remap_pte_range(mm, pmd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	pud_t *pud;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		err = remap_pmd_range(mm, pud, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
			unsigned long addr, unsigned long end,
			unsigned long pfn, pgprot_t prot)
{
	p4d_t *p4d;
	unsigned long next;
	int err;

	pfn -= addr >> PAGE_SHIFT;
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		err = remap_pud_range(mm, p4d, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
		unsigned long pfn, unsigned long size, pgprot_t prot)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long end = addr + PAGE_ALIGN(size);
	struct mm_struct *mm = vma->vm_mm;
	int err;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
		return -EINVAL;

	/*
	 * Physically remapped pages are special. Tell the
	 * rest of the world about it:
	 *   VM_IO tells people not to look at these pages
	 *	(accesses can have side effects).
	 *   VM_PFNMAP tells the core MM that the base pages are just
	 *	raw PFN mappings, and do not have a "struct page" associated
	 *	with them.
	 *   VM_DONTEXPAND
	 *	Disable vma merging and expanding with mremap().
	 *   VM_DONTDUMP
	 *	Omit vma from core dump, even when VM_IO turned off.
	 *
	 * There's a horrible special case to handle copy-on-write
	 * behaviour that some programs depend on. We mark the "original"
	 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
	 * See vm_normal_page() for details.
	 */
	if (is_cow_mapping(vma->vm_flags)) {
		if (addr != vma->vm_start || end != vma->vm_end)
			return -EINVAL;
		vma->vm_pgoff = pfn;
	}

	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

	BUG_ON(addr >= end);
	pfn -= addr >> PAGE_SHIFT;
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	do {
		next = pgd_addr_end(addr, end);
		err = remap_p4d_range(mm, pgd, addr, next,
				pfn + (addr >> PAGE_SHIFT), prot);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
 * must have pre-validated the caching bits of the pgprot_t.
 */
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
		unsigned long pfn, unsigned long size, pgprot_t prot)
{
	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);

	if (!error)
		return 0;

	/*
	 * A partial pfn range mapping is dangerous: it does not
	 * maintain page reference counts, and callers may free
	 * pages due to the error. So zap it early.
	 */
	zap_page_range_single(vma, addr, size, NULL);
	return error;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
		    unsigned long pfn, unsigned long size, pgprot_t prot)
{
	int err;

	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
	if (err)
		return -EINVAL;

	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
	if (err)
		untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
	return err;
}
EXPORT_SYMBOL(remap_pfn_range);
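/*
 * A minimal usage sketch (hypothetical my_dev driver: my_dev and its
 * phys_base are assumptions, not part of this file): the classic pattern
 * of mapping a physical region at mmap() time, honouring the offset the
 * caller encoded in vm_pgoff.
 *
 *	static int my_dev_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_dev *dev = file->private_data;
 *		unsigned long pfn = (dev->phys_base >> PAGE_SHIFT) +
 *				    vma->vm_pgoff;
 *
 *		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 *		return remap_pfn_range(vma, vma->vm_start, pfn,
 *				       vma->vm_end - vma->vm_start,
 *				       vma->vm_page_prot);
 *	}
 */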
/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * write-combining or similar behaviour.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
	unsigned long vm_len, pfn, pages;

	/* Check that the physical memory area passed in looks valid */
	if (start + len < start)
		return -EINVAL;
	/*
	 * You *really* shouldn't map things that aren't page-aligned,
	 * but we've historically allowed it because IO memory might
	 * just have smaller alignment.
	 */
	len += start & ~PAGE_MASK;
	pfn = start >> PAGE_SHIFT;
	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
	if (pfn + pages < pfn)
		return -EINVAL;

	/* We start the mapping 'vm_pgoff' pages into the area */
	if (vma->vm_pgoff > pages)
		return -EINVAL;
	pfn += vma->vm_pgoff;
	pages -= vma->vm_pgoff;

	/* Can we fit all of the mapping? */
	vm_len = vma->vm_end - vma->vm_start;
	if (vm_len >> PAGE_SHIFT > pages)
		return -EINVAL;

	/* Ok, let it rip */
	return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);
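/*
 * A minimal usage sketch (hypothetical framebuffer driver: my_fb, its
 * fb_phys base and fb_len are assumptions, not part of this file):
 * compared with the remap_pfn_range() example above, vm_iomap_memory()
 * takes the raw physical range and derives the pfn, the vm_pgoff handling
 * and the size checks itself.
 *
 *	static int my_fb_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		struct my_fb *fb = file->private_data;
 *
 *		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 *		return vm_iomap_memory(vma, fb->fb_phys, fb->fb_len);
 *	}
 */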
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pte_t *pte, *mapped_pte;
	int err = 0;
	spinlock_t *ptl;

	if (create) {
		mapped_pte = pte = (mm == &init_mm) ?
			pte_alloc_kernel_track(pmd, addr, mask) :
			pte_alloc_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -ENOMEM;
	} else {
		mapped_pte = pte = (mm == &init_mm) ?
			pte_offset_kernel(pmd, addr) :
			pte_offset_map_lock(mm, pmd, addr, &ptl);
		if (!pte)
			return -EINVAL;
	}

	arch_enter_lazy_mmu_mode();

	if (fn) {
		do {
			if (create || !pte_none(ptep_get(pte))) {
				err = fn(pte, addr, data);
				if (err)
					break;
			}
		} while (pte++, addr += PAGE_SIZE, addr != end);
	}
	*mask |= PGTBL_PTE_MODIFIED;

	arch_leave_lazy_mmu_mode();

	if (mm != &init_mm)
		pte_unmap_unlock(mapped_pte, ptl);
	return err;
}

static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	BUG_ON(pud_leaf(*pud));

	if (create) {
		pmd = pmd_alloc_track(mm, pud, addr, mask);
		if (!pmd)
			return -ENOMEM;
	} else {
		pmd = pmd_offset(pud, addr);
	}
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd) && !create)
			continue;
		if (WARN_ON_ONCE(pmd_leaf(*pmd)))
			return -EINVAL;
		if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
			if (!create)
				continue;
			pmd_clear_bad(pmd);
		}
		err = apply_to_pte_range(mm, pmd, addr, next,
					 fn, data, create, mask);
		if (err)
			break;
	} while (pmd++, addr = next, addr != end);

	return err;
}

static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	if (create) {
		pud = pud_alloc_track(mm, p4d, addr, mask);
		if (!pud)
			return -ENOMEM;
	} else {
		pud = pud_offset(p4d, addr);
	}
	do {
		next = pud_addr_end(addr, end);
		if (pud_none(*pud) && !create)
			continue;
		if (WARN_ON_ONCE(pud_leaf(*pud)))
			return -EINVAL;
		if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
			if (!create)
				continue;
			pud_clear_bad(pud);
		}
		err = apply_to_pmd_range(mm, pud, addr, next,
					 fn, data, create, mask);
		if (err)
			break;
	} while (pud++, addr = next, addr != end);

	return err;
}

static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
				     unsigned long addr, unsigned long end,
				     pte_fn_t fn, void *data, bool create,
				     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	if (create) {
		p4d = p4d_alloc_track(mm, pgd, addr, mask);
		if (!p4d)
			return -ENOMEM;
	} else {
		p4d = p4d_offset(pgd, addr);
	}
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none(*p4d) && !create)
			continue;
		if (WARN_ON_ONCE(p4d_leaf(*p4d)))
			return -EINVAL;
		if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
			if (!create)
				continue;
			p4d_clear_bad(p4d);
		}
		err = apply_to_pud_range(mm, p4d, addr, next,
					 fn, data, create, mask);
		if (err)
			break;
	} while (p4d++, addr = next, addr != end);

	return err;
}

static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn,
				 void *data, bool create)
{
	pgd_t *pgd;
	unsigned long start = addr, next;
	unsigned long end = addr + size;
	pgtbl_mod_mask mask = 0;
	int err = 0;

	if (WARN_ON(addr >= end))
		return -EINVAL;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none(*pgd) && !create)
			continue;
		if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
			err = -EINVAL;
			break;
		}
		if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
			if (!create)
				continue;
			pgd_clear_bad(pgd);
		}
		err = apply_to_p4d_range(mm, pgd, addr, next,
					 fn, data, create, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, start + size);

	return err;
}

/*
 * Scan a region of virtual memory, filling in page tables as necessary
 * and calling a provided function on each leaf page table.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
			unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, true);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Scan a region of virtual memory, calling a provided function on
 * each leaf page table where it exists.
 *
 * Unlike apply_to_page_range, this does _not_ fill in page tables
 * where they are absent.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
				 unsigned long size, pte_fn_t fn, void *data)
{
	return __apply_to_page_range(mm, addr, size, fn, data, false);
}
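/*
 * A minimal usage sketch (my_count_pte() and its use are assumptions, not
 * part of this file): the pte_fn_t callback is invoked once per leaf PTE,
 * with the PTE lock held for user mms; here it just counts populated
 * kernel PTEs, relying on apply_to_existing_page_range() skipping holes.
 *
 *	static int my_count_pte(pte_t *pte, unsigned long addr, void *data)
 *	{
 *		unsigned long *count = data;
 *
 *		if (!pte_none(ptep_get(pte)))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	static unsigned long my_count_range(unsigned long addr, unsigned long size)
 *	{
 *		unsigned long count = 0;
 *
 *		apply_to_existing_page_range(&init_mm, addr, size,
 *					     my_count_pte, &count);
 *		return count;
 *	}
 *
 * With apply_to_page_range() the same callback would also run on newly
 * allocated (empty) page tables, since create == true fills them in.
 */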
/*
 * handle_pte_fault chooses page fault handler according to an entry which was
 * read non-atomically.  Before making any commitment, on those architectures
 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
 * parts, do_swap_page must check under lock before unmapping the pte and
 * proceeding (but do_wp_page is only called after already making such a check;
 * and do_anonymous_page can safely check later on).
 */
static inline int pte_unmap_same(struct vm_fault *vmf)
{
	int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
	if (sizeof(pte_t) > sizeof(unsigned long)) {
		spin_lock(vmf->ptl);
		same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
		spin_unlock(vmf->ptl);
	}
#endif
	pte_unmap(vmf->pte);
	vmf->pte = NULL;
	return same;
}

/*
 * Return:
 *	0:		copy succeeded
 *	-EHWPOISON:	copy failed due to hwpoison in source page
 *	-EAGAIN:	copy failed (some other reason)
 */
static inline int __wp_page_copy_user(struct page *dst, struct page *src,
				      struct vm_fault *vmf)
{
	int ret;
	void *kaddr;
	void __user *uaddr;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr = vmf->address;

	if (likely(src)) {
		if (copy_mc_user_highpage(dst, src, addr, vma))
			return -EHWPOISON;
		return 0;
	}

	/*
	 * If the source page was a PFN mapping, we don't have
	 * a "struct page" for it. We do a best-effort copy by
	 * just copying from the original user address. If that
	 * fails, we just zero-fill it. Live with it.
	 */
	kaddr = kmap_local_page(dst);
	pagefault_disable();
	uaddr = (void __user *)(addr & PAGE_MASK);

	/*
	 * On architectures with software "accessed" bits, we would
	 * take a double page fault, so mark it accessed here.
	 */
	vmf->pte = NULL;
	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
		pte_t entry;

		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
			/*
			 * Another thread has already handled the fault;
			 * just update the local TLB.
			 */
			if (vmf->pte)
				update_mmu_tlb(vma, addr, vmf->pte);
			ret = -EAGAIN;
			goto pte_unlock;
		}

		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
			update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
	}

	/*
	 * This really shouldn't fail, because the page is there
	 * in the page tables. But it might just be unreadable,
	 * in which case we just give up and fill the result with
	 * zeroes.
	 */
	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
		if (vmf->pte)
			goto warn;

		/* Re-validate under PTL if the page is still mapped */
		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
			/* The PTE changed under us, update the local TLB */
			if (vmf->pte)
				update_mmu_tlb(vma, addr, vmf->pte);
			ret = -EAGAIN;
			goto pte_unlock;
		}

		/*
		 * The same page may have been mapped back since the last copy
		 * attempt. Try to copy again under the PTL.
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
			/*
			 * Warn in case this turns out to be some obscure
			 * use-case.
			 */
warn:
			WARN_ON_ONCE(1);
			clear_page(kaddr);
		}
	}

	ret = 0;

pte_unlock:
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	pagefault_enable();
	kunmap_local(kaddr);
	flush_dcache_page(dst);

	return ret;
}
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
	struct file *vm_file = vma->vm_file;

	if (vm_file)
		return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;

	/*
	 * Special mappings (e.g. VDSO) do not have any file so fake
	 * a default GFP_KERNEL for them.
	 */
	return GFP_KERNEL;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
	vm_fault_t ret;
	unsigned int old_flags = vmf->flags;

	vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

	if (vmf->vma->vm_file &&
	    IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
		return VM_FAULT_SIGBUS;

	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
	/* Restore original flags so that caller is not surprised */
	vmf->flags = old_flags;
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
		return ret;
	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
		folio_lock(folio);
		if (!folio->mapping) {
			folio_unlock(folio);
			return 0; /* retry */
		}
		ret |= VM_FAULT_LOCKED;
	} else
		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	return ret;
}
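/*
 * A minimal sketch of the other side of this call (hypothetical
 * filesystem, not from this file): a ->page_mkwrite handler typically
 * locks the folio, revalidates it against truncation, waits for writeback
 * where needed, and returns VM_FAULT_LOCKED so the caller above skips its
 * own folio_lock().
 *
 *	static vm_fault_t my_fs_page_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct folio *folio = page_folio(vmf->page);
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *
 *		folio_lock(folio);
 *		if (folio->mapping != inode->i_mapping) {
 *			folio_unlock(folio);
 *			return VM_FAULT_NOPAGE;
 *		}
 *		folio_wait_stable(folio);
 *		return VM_FAULT_LOCKED;
 *	}
 */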
/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping;
	struct folio *folio = page_folio(vmf->page);
	bool dirtied;
	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

	dirtied = folio_mark_dirty(folio);
	VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
	/*
	 * Take a local copy of the address_space - folio.mapping may be zeroed
	 * by truncate after folio_unlock().  The address_space itself remains
	 * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
	 * release semantics to prevent the compiler from undoing this copying.
	 */
	mapping = folio_raw_mapping(folio);
	folio_unlock(folio);

	if (!page_mkwrite)
		file_update_time(vma->vm_file);

	/*
	 * Throttle page dirtying rate down to writeback speed.
	 *
	 * mapping may be NULL here because some device drivers do not
	 * set page.mapping but still dirty their pages.
	 *
	 * Drop the mmap_lock before waiting on IO, if we can. The file
	 * is pinning the mapping, as per above.
	 */
	if ((dirtied || page_mkwrite) && mapping) {
		struct file *fpin;

		fpin = maybe_unlock_mmap_for_io(vmf, NULL);
		balance_dirty_pages_ratelimited(mapping);
		if (fpin) {
			fput(fpin);
			return VM_FAULT_COMPLETED;
		}
	}

	return 0;
}

/*
 * Handle write page faults for pages that can be reused in the current vma.
 *
 * This can happen either because the mapping has the VM_SHARED flag set,
 * or because we hold the last remaining reference to the page.  In either
 * case, all we need to do here is to mark the page as writable and update
 * any related book-keeping.
 */
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	pte_t entry;

	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
	VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));

	if (folio) {
		VM_BUG_ON(folio_test_anon(folio) &&
			  !PageAnonExclusive(vmf->page));
		/*
		 * Clear the folio's cpupid information as the existing
		 * information potentially belongs to a now completely
		 * unrelated process.
		 */
		folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
	}

	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
	entry = pte_mkyoung(vmf->orig_pte);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	count_vm_event(PGREUSE);
}

/*
 * We could add a bitflag somewhere, but for now, we know that all
 * vm_ops that have a ->map_pages have been audited and don't need
 * the mmap_lock to be held.
 */
static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
		return 0;
	vma_end_read(vma);
	return VM_FAULT_RETRY;
}

/**
 * __vmf_anon_prepare - Prepare to handle an anonymous fault.
 * @vmf: The vm_fault descriptor passed from the fault handler.
 *
 * When preparing to insert an anonymous page into a VMA from a
 * fault handler, call this function rather than anon_vma_prepare().
 * If this vma does not already have an associated anon_vma and we are
 * only protected by the per-VMA lock, the caller must retry with the
 * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
 * determine if this VMA can share its anon_vma, and that's not safe to
 * do with only the per-VMA lock held for this VMA.
 *
 * Return: 0 if fault handling can proceed.  Any other value should be
 * returned to the caller.
 */
vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	if (likely(vma->anon_vma))
		return 0;
	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
		if (!mmap_read_trylock(vma->vm_mm))
			return VM_FAULT_RETRY;
	}
	if (__anon_vma_prepare(vma))
		ret = VM_FAULT_OOM;
	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
		mmap_read_unlock(vma->vm_mm);
	return ret;
}
/*
 * Handle the case of a page which we actually need to copy to a new page,
 * either due to COW or unsharing.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle bookkeeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page.
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct folio *old_folio = NULL;
	struct folio *new_folio = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mmu_notifier_range range;
	vm_fault_t ret;
	bool pfn_is_zero;

	delayacct_wpcopy_start();

	if (vmf->page)
		old_folio = page_folio(vmf->page);
	ret = vmf_anon_prepare(vmf);
	if (unlikely(ret))
		goto out;

	pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
	new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
	if (!new_folio)
		goto oom;

	if (!pfn_is_zero) {
		int err;

		err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
		if (err) {
			/*
			 * The COW failed. If the fault was resolved by
			 * another thread, that's fine; if not, userspace
			 * will re-fault on the same address and we will
			 * handle the fault from the second attempt.
			 * The -EHWPOISON case will not be retried.
			 */
			folio_put(new_folio);
			if (old_folio)
				folio_put(old_folio);

			delayacct_wpcopy_end();
			return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
		}
		kmsan_copy_page_meta(&new_folio->page, vmf->page);
	}

	__folio_mark_uptodate(new_folio);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock.
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
		if (old_folio) {
			if (!folio_test_anon(old_folio)) {
				dec_mm_counter(mm, mm_counter_file(old_folio));
				inc_mm_counter(mm, MM_ANONPAGES);
			}
		} else {
			ksm_might_unmap_zero_page(mm, vmf->orig_pte);
			inc_mm_counter(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(&new_folio->page, vma->vm_page_prot);
		entry = pte_sw_mkyoung(entry);
		if (unlikely(unshare)) {
			if (pte_soft_dirty(vmf->orig_pte))
				entry = pte_mksoft_dirty(entry);
			if (pte_uffd_wp(vmf->orig_pte))
				entry = pte_mkuffd_wp(entry);
		} else {
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		}

		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry, to keep TLBs on different CPUs in
		 * sync. This code used to set the new PTE then flush TLBs, but
		 * that left a window where the new PTE could be loaded into
		 * some TLBs while the old PTE remains in others.
		 */
		ptep_clear_flush(vma, vmf->address, vmf->pte);
		folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE);
		folio_add_lru_vma(new_folio, vma);
		BUG_ON(unshare && pte_write(entry));
		set_pte_at(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
		if (old_folio) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * folio_remove_rmap_pte() with the ptep_clear_flush
			 * above. Those stores are ordered by (if nothing else)
			 * the barrier present in the atomic_add_negative
			 * in folio_remove_rmap_pte();
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			folio_remove_rmap_pte(old_folio, vmf->page, vma);
		}

		/* Free the old page.. */
		new_folio = old_folio;
		page_copied = 1;
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	} else if (vmf->pte) {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	}

	mmu_notifier_invalidate_range_end(&range);

	if (new_folio)
		folio_put(new_folio);
	if (old_folio) {
		if (page_copied)
			free_swap_cache(old_folio);
		folio_put(old_folio);
	}

	delayacct_wpcopy_end();
	return 0;
oom:
	ret = VM_FAULT_OOM;
out:
	if (old_folio)
		folio_put(old_folio);

	delayacct_wpcopy_end();
	return ret;
}
/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 * @folio: the folio of vmf->page
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	if (!vmf->pte)
		return VM_FAULT_NOPAGE;
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	wp_page_reuse(vmf, folio);
	return 0;
}
/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping.
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		ret = vmf_can_call_fault(vmf);
		if (ret)
			return ret;

		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf, NULL);
	}
	wp_page_reuse(vmf, NULL);
	return 0;
}
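/*
 * A minimal sketch of a ->pfn_mkwrite handler (hypothetical driver:
 * my_dev and my_dev_may_write() are assumptions, not part of this file):
 * returning 0 lets finish_mkwrite_fault() above mark the PFN writable,
 * while an error return fails the write fault instead.
 *
 *	static vm_fault_t my_dev_pfn_mkwrite(struct vm_fault *vmf)
 *	{
 *		struct my_dev *dev = vmf->vma->vm_private_data;
 *
 *		if (!my_dev_may_write(dev, vmf->pgoff))
 *			return VM_FAULT_SIGBUS;
 *		return 0;
 *	}
 */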
static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = 0;

	folio_get(folio);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = vmf_can_call_fault(vmf);
		if (tmp) {
			folio_put(folio);
			return tmp;
		}

		tmp = do_page_mkwrite(vmf, folio);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			folio_put(folio);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf, folio);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			folio_unlock(folio);
			folio_put(folio);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf, folio);
		folio_lock(folio);
	}
	ret |= fault_dirty_shared_page(vmf);
	folio_put(folio);

	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
		struct vm_area_struct *vma)
{
	bool exclusive = false;

	/* Let's just free up a large folio if only a single page is mapped. */
	if (folio_large_mapcount(folio) <= 1)
		return false;

	/*
	 * The assumption for anonymous folios is that each page can only get
	 * mapped once into each MM. The only exception are KSM folios, which
	 * are always small.
	 *
	 * Each taken mapcount must be paired with exactly one taken reference,
	 * whereby the refcount must be incremented before the mapcount when
	 * mapping a page, and the refcount must be decremented after the
	 * mapcount when unmapping a page.
	 *
	 * If all folio references are from mappings, and all mappings are in
	 * the page tables of this MM, then this folio is exclusive to this MM.
	 */
	if (folio_test_large_maybe_mapped_shared(folio))
		return false;

	VM_WARN_ON_ONCE(folio_test_ksm(folio));

	if (unlikely(folio_test_swapcache(folio))) {
		/*
		 * Note: freeing up the swapcache will fail if some PTEs are
		 * still swap entries.
		 */
		if (!folio_trylock(folio))
			return false;
		folio_free_swap(folio);
		folio_unlock(folio);
	}

	if (folio_large_mapcount(folio) != folio_ref_count(folio))
		return false;

	/* Stabilize the mapcount vs. refcount and recheck. */
	folio_lock_large_mapcount(folio);
	VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));

	if (folio_test_large_maybe_mapped_shared(folio))
		goto unlock;
	if (folio_large_mapcount(folio) != folio_ref_count(folio))
		goto unlock;

	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio),
			      folio);
	VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
			folio_mm_id(folio, 1) != vma->vm_mm->mm_id);

	/*
	 * Do we need the folio lock? Likely not. If there would have been
	 * references from page migration/swapout, we would have detected
	 * an additional folio reference and never ended up here.
	 */
	exclusive = true;

unlock:
	folio_unlock_large_mapcount(folio);
	return exclusive;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
		struct vm_area_struct *vma)
{
	BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static bool wp_can_reuse_anon_folio(struct folio *folio,
				    struct vm_area_struct *vma)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
		return __wp_can_reuse_large_anon_folio(folio, vma);

	/*
	 * We have to verify under folio lock: these early checks are
	 * just an optimization to avoid locking the folio and freeing
	 * the swapcache if there is little hope that we can reuse.
	 *
	 * KSM doesn't necessarily raise the folio refcount.
	 */
	if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
		return false;
	if (!folio_test_lru(folio))
		/*
		 * We cannot easily detect+handle references from
		 * remote LRU caches or references to LRU folios.
		 */
		lru_add_drain();
	if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
		return false;
	if (!folio_trylock(folio))
		return false;
	if (folio_test_swapcache(folio))
		folio_free_swap(folio);
	if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
		folio_unlock(folio);
		return false;
	}
	/*
	 * Ok, we've got the only folio reference from our mapping
	 * and the folio is locked, it's dark out, and we're wearing
	 * sunglasses. Hit it.
	 */
	folio_move_anon_rmap(folio, vma);
	folio_unlock(folio);
	return true;
}
/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = NULL;
	pte_t pte;

	if (likely(!unshare)) {
		if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
			if (!userfaultfd_wp_async(vma)) {
				pte_unmap_unlock(vmf->pte, vmf->ptl);
				return handle_userfault(vmf, VM_UFFD_WP);
			}

			/*
			 * Nothing needed (cache flush, TLB invalidations,
			 * etc.) because we're only removing the uffd-wp bit,
			 * which is completely invisible to the user.
			 */
			pte = pte_clear_uffd_wp(ptep_get(vmf->pte));

			set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
			/*
			 * Update orig_pte so that the CoW handling below
			 * sees the cleared uffd-wp bit.
			 */
			vmf->orig_pte = pte;
		}

		/*
		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
		 * is flushed in this case before copying.
		 */
		if (unlikely(userfaultfd_wp(vmf->vma) &&
			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
			flush_tlb_page(vmf->vma, vmf->address);
	}

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

	if (vmf->page)
		folio = page_folio(vmf->page);

	/*
	 * Shared mapping: we are guaranteed to have VM_WRITE and
	 * FAULT_FLAG_WRITE set at this point.
	 */
	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if (!vmf->page || is_fsdax_page(vmf->page)) {
			vmf->page = NULL;
			return wp_pfn_shared(vmf);
		}
		return wp_page_shared(vmf, folio);
	}

	/*
	 * Private mapping: create an exclusive anonymous page copy if reuse
	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
	 *
	 * If we encounter a page that is marked exclusive, we must reuse
	 * the page without further checks.
	 */
	if (folio && folio_test_anon(folio) &&
	    (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
		if (!PageAnonExclusive(vmf->page))
			SetPageAnonExclusive(vmf->page);
		if (unlikely(unshare)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return 0;
		}
		wp_page_reuse(vmf, folio);
		return 0;
	}
	/*
	 * Ok, we need to copy. Oh, well..
	 */
	if (folio)
		folio_get(folio);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
	if (folio && folio_test_ksm(folio))
		count_vm_event(COW_KSM);
#endif
	return wp_page_copy(vmf);
}

static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    pgoff_t first_index,
					    pgoff_t last_index,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root, first_index, last_index) {
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		zba = max(first_index, vba);
		zea = min(last_index, vea);

		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmapped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
 */
void unmap_mapping_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;
	struct zap_details details = { };
	pgoff_t first_index;
	pgoff_t last_index;

	VM_BUG_ON(!folio_test_locked(folio));

	first_index = folio->index;
	last_index = folio_next_index(folio) - 1;

	details.even_cows = false;
	details.single_folio = folio;
	details.zap_flags = ZAP_FLAG_DROP_MARKER;

	i_mmap_lock_read(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
					 last_index, &details);
	i_mmap_unlock_read(mapping);
}
/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmapped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
		pgoff_t nr, bool even_cows)
{
	struct zap_details details = { };
	pgoff_t first_index = start;
	pgoff_t last_index = start + nr - 1;

	details.even_cows = even_cows;
	if (last_index < first_index)
		last_index = ULONG_MAX;

	i_mmap_lock_read(mapping);
	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
		unmap_mapping_range_tree(&mapping->i_mmap, first_index,
					 last_index, &details);
	i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen,
		int even_cows)
{
	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);
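/*
 * A minimal usage sketch (generic truncation pattern, not from this file):
 * a filesystem shrinking an inode to newsize unmaps everything from the
 * first affected byte onwards (holelen == 0 means "to end of file"),
 * including private COWed pages (even_cows == 1), before dropping the
 * pagecache itself. In-tree callers usually go through the
 * truncate_pagecache() helper, which wraps this sequence.
 *
 *	static void my_fs_truncate(struct inode *inode, loff_t newsize)
 *	{
 *		unmap_mapping_range(inode->i_mapping, newsize, 0, 1);
 *		truncate_inode_pages(inode->i_mapping, newsize);
 *	}
 */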
/*
 * Restore a potential device exclusive pte to a working pte entry.
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct vm_area_struct *vma = vmf->vma;
	struct mmu_notifier_range range;
	vm_fault_t ret;

	/*
	 * We need a reference to lock the folio because we don't hold
	 * the PTL so a racing thread can remove the device-exclusive
	 * entry and unmap it. If the folio is free the entry must
	 * have been removed already. If it happens to have already
	 * been re-allocated after being freed all we do is lock and
	 * unlock it.
	 */
	if (!folio_try_get(folio))
		return 0;

	ret = folio_lock_or_retry(folio, vmf);
	if (ret) {
		folio_put(folio);
		return ret;
	}
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
				vma->vm_mm, vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
	mmu_notifier_invalidate_range_start(&range);

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				&vmf->ptl);
	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
		restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
				      vmf->pte, vmf->orig_pte);

	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	folio_unlock(folio);
	folio_put(folio);

	mmu_notifier_invalidate_range_end(&range);
	return 0;
}

static inline bool should_try_to_free_swap(struct folio *folio,
					   struct vm_area_struct *vma,
					   unsigned int fault_flags)
{
	if (!folio_test_swapcache(folio))
		return false;
	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
	    folio_test_mlocked(folio))
		return true;
	/*
	 * If we want to map a page that's in the swapcache writable, we
	 * have to detect via the refcount if we're really the exclusive
	 * user. Try freeing the swapcache to get rid of the swapcache
	 * reference only in case it's likely that we'll be the exclusive user.
	 */
	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
		folio_ref_count(folio) == (1 + folio_nr_pages(folio));
}

static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	if (!vmf->pte)
		return 0;
	/*
	 * Be careful so that we will only recover a special uffd-wp pte into a
	 * none pte.  Otherwise it means the pte could have changed, so retry.
	 *
	 * This should also cover the case where e.g. the pte changed
	 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
	 * So is_pte_marker() check is not enough to safely drop the pte.
	 */
	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

static vm_fault_t do_pte_missing(struct vm_fault *vmf)
{
	if (vma_is_anonymous(vmf->vma))
		return do_anonymous_page(vmf);
	else
		return do_fault(vmf);
}

/*
 * This is actually a page-missing access, but with uffd-wp special pte
 * installed.  It means this pte was wr-protected before being unmapped.
 */
static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
	/*
	 * Just in case there're leftover special ptes even after the region
	 * got unregistered - we can simply clear them.
	 */
	if (unlikely(!userfaultfd_wp(vmf->vma)))
		return pte_marker_clear(vmf);

	return do_pte_missing(vmf);
}

static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
	unsigned long marker = pte_marker_get(entry);

	/*
	 * PTE markers should never be empty.  If anything weird happened,
	 * the best thing to do is to kill the process along with its mm.
	 */
	if (WARN_ON_ONCE(!marker))
		return VM_FAULT_SIGBUS;

	/* Higher priority than uffd-wp when data corrupted */
	if (marker & PTE_MARKER_POISONED)
		return VM_FAULT_HWPOISON;

	/* Hitting a guard page is always a fatal condition. */
	if (marker & PTE_MARKER_GUARD)
		return VM_FAULT_SIGSEGV;

	if (pte_marker_entry_uffd_wp(entry))
		return pte_marker_handle_uffd_wp(vmf);

	/* This is an unknown pte marker */
	return VM_FAULT_SIGBUS;
}
static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	swp_entry_t entry;

	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
	if (!folio)
		return NULL;

	entry = pte_to_swp_entry(vmf->orig_pte);
	if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
					   GFP_KERNEL, entry)) {
		folio_put(folio);
		return NULL;
	}

	return folio;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
{
	struct swap_info_struct *si = swp_swap_info(entry);
	pgoff_t offset = swp_offset(entry);
	int i;

	/*
	 * When we allocate a large folio and read it with swap_read_folio()
	 * directly (the case in which the faulting pte has no swapcache),
	 * we must ensure that none of the other PTEs in the range has a
	 * swapcache entry either; otherwise we would go to the swap device
	 * while the content actually lives in the swapcache.
	 */
	for (i = 0; i < max_nr; i++) {
		if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
			return i;
	}

	return i;
}

/*
 * Check if the PTEs within a range are contiguous swap entries
 * and have consistent swapcache and zeromap state.
 */
static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
{
	unsigned long addr;
	swp_entry_t entry;
	int idx;
	pte_t pte;

	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
	idx = (vmf->address - addr) / PAGE_SIZE;
	pte = ptep_get(ptep);

	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
		return false;
	entry = pte_to_swp_entry(pte);
	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
		return false;

	/*
	 * swap_read_folio() can't handle the case where a large folio is
	 * backed partly by one backend and partly by another, and such
	 * hybrid cases are likely corner cases anyway. Similar checks
	 * might be added once zswap supports large folios.
	 */
	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
		return false;
	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
		return false;

	return true;
}

static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
						     unsigned long addr,
						     unsigned long orders)
{
	int order, nr;

	order = highest_order(orders);

	/*
	 * To swap in a THP with nr pages, we require that its first swap_offset
	 * is aligned with that number, as it was when the THP was swapped out.
	 * This helps filter out most invalid entries.
	 */
	while (orders) {
		nr = 1 << order;
		if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
			break;
		order = next_order(&orders, order);
	}

	return orders;
}
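/*
 * A worked example of the alignment rule above (numbers illustrative):
 * for an order-2 candidate, nr = 4. An entry with swp_offset == 9 can only
 * have belonged to an order-2 THP swapped out at offsets 8..11, i.e. at
 * index 9 % 4 == 1 within its window; the faulting address must sit at the
 * same index within its own 4-page window, so order 2 survives only when
 * (addr >> PAGE_SHIFT) % 4 == 1. Otherwise next_order() drops to the next
 * enabled order.
 */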
*/ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address & PMD_MASK, &ptl); if (unlikely(!pte)) goto fallback; /* * For do_swap_page, find the highest order where the aligned range is * completely swap entries with contiguous swap offsets. */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap_unlock(pte, ptl); /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } fallback: return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; swp_entry_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; int nr_pages; unsigned long page_idx; unsigned long address; pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_exclusive_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate * under VMA lock. */ vma_end_read(vma); ret = VM_FAULT_RETRY; goto out; } vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto unlock; /* * Get a page reference while we know the page can't be * freed. 
*/ if (trylock_page(vmf->page)) { struct dev_pagemap *pgmap; get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); pgmap = page_pgmap(vmf->page); ret = pgmap->ops->migrate_to_ram(vmf); unlock_page(vmf->page); put_page(vmf->page); } else { pte_unmap_unlock(vmf->pte, vmf->ptl); } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } /* Prevent swapoff from happening to us. */ si = get_swap_device(entry); if (unlikely(!si)) goto out; folio = swap_cache_get_folio(entry, vma, vmf->address); if (folio) page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); nr_pages = folio_nr_pages(folio); if (folio_test_large(folio)) entry.val = ALIGN_DOWN(entry.val, nr_pages); /* * Prevent parallel swapin from proceeding with * the cache flag. Otherwise, another thread * may finish swapin first, free the entry, and * swapout reusing the same entry. It's * undetectable as pte_same() returns true due * to entry reuse. */ if (swapcache_prepare(entry, nr_pages)) { /* * Relax a bit to prevent rapid * repeated page faults. */ add_wait_queue(&swapcache_wq, &wait); schedule_timeout_uninterruptible(1); remove_wait_queue(&swapcache_wq, &wait); goto out_page; } need_clear_cache = true; memcg1_swapin(entry, nr_pages); shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(folio, shadow); folio_add_lru(folio); /* To provide entry to swap_read_folio() */ folio->swap = entry; swap_read_folio(folio, NULL); folio->private = NULL; } } else { folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = folio; } if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) ret = VM_FAULT_OOM; goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); page = folio_file_page(folio, swp_offset(entry)); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not * changed. */ if (unlikely(!folio_test_swapcache(folio) || page_swap_entry(page).val != entry.val)) goto out_page; /* * KSM sometimes has to copy on read faults, for example, if * page->index of !PageKSM() pages would be nonlinear inside the * anon VMA -- PageKSM() is lost on actual swapout. 
*/ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { ret = VM_FAULT_OOM; folio = swapcache; goto out_page; } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) { ret = VM_FAULT_HWPOISON; folio = swapcache; goto out_page; } if (folio != swapcache) page = folio_page(folio, 0); /* * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * owner. Try removing the extra reference from the local LRU * caches if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } folio_throttle_swaprate(folio, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) goto out_nomap; if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } /* allocated large folios for SWP_SYNCHRONOUS_IO */ if (folio_test_large(folio) && !folio_test_swapcache(folio)) { unsigned long nr = folio_nr_pages(folio); unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; pte_t *folio_ptep = vmf->pte - idx; pte_t folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto out_nomap; page_idx = idx; address = folio_start; ptep = folio_ptep; goto check_folio; } nr_pages = 1; page_idx = 0; address = vmf->address; ptep = vmf->pte; if (folio_test_large(folio) && folio_test_swapcache(folio)) { int nr = folio_nr_pages(folio); unsigned long idx = folio_page_idx(folio, page); unsigned long folio_start = address - idx * PAGE_SIZE; unsigned long folio_end = folio_start + nr * PAGE_SIZE; pte_t *folio_ptep; pte_t folio_pte; if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) goto check_folio; if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) goto check_folio; folio_ptep = vmf->pte - idx; folio_pte = ptep_get(folio_ptep); if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || swap_pte_batch(folio_ptep, nr, folio_pte) != nr) goto check_folio; page_idx = idx; address = folio_start; ptep = folio_ptep; nr_pages = nr; entry = folio->swap; page = &folio->page; } check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is * PG_anon_exclusive. Sanity check that this holds and especially, that * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. */ exclusive = true; } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support * concurrent page modifications while under writeback. 
			 *
			 * So if we stumble over such a page in the swapcache
			 * we must not set the page exclusive, otherwise we
			 * can map it writable without further checks and
			 * modify it while still under writeback.
			 *
			 * For these problematic swap backends, simply drop
			 * the exclusive marker: this is perfectly fine as we
			 * start writeback only if we fully unmapped the page
			 * and there are no unexpected references on the page
			 * after unmapping succeeded. After fully unmapped, no
			 * further GUP references (FOLL_GET and FOLL_PIN) can
			 * appear, so dropping the exclusive marker and
			 * mapping it only R/O is fine.
			 */
			exclusive = false;
		}
	}

	/*
	 * Some architectures may have to restore extra metadata to the page
	 * when reading from swap. This metadata may be indexed by swap entry
	 * so this must be called before swap_free().
	 */
	arch_swap_restore(folio_swap(entry, folio), folio);

	/*
	 * Remove the swap entry and conditionally try to free up the
	 * swapcache. We're already holding a reference on the page but
	 * haven't mapped it yet.
	 */
	swap_free_nr(entry, nr_pages);
	if (should_try_to_free_swap(folio, vma, vmf->flags))
		folio_free_swap(folio);

	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
	add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
	pte = mk_pte(page, vma->vm_page_prot);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	if (pte_swp_uffd_wp(vmf->orig_pte))
		pte = pte_mkuffd_wp(pte);

	/*
	 * Same logic as in do_wp_page(); however, optimize for pages that are
	 * certainly not shared either because we just allocated them without
	 * exposing them to the swapcache or because the swap entry indicates
	 * exclusivity.
	 */
	if (!folio_test_ksm(folio) &&
	    (exclusive || folio_ref_count(folio) == 1)) {
		if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
		    !pte_needs_soft_dirty_wp(vma, pte)) {
			pte = pte_mkwrite(pte, vma);
			if (vmf->flags & FAULT_FLAG_WRITE) {
				pte = pte_mkdirty(pte);
				vmf->flags &= ~FAULT_FLAG_WRITE;
			}
		}
		rmap_flags |= RMAP_EXCLUSIVE;
	}
	folio_ref_add(folio, nr_pages - 1);
	flush_icache_pages(vma, page, nr_pages);
	vmf->orig_pte = pte_advance_pfn(pte, page_idx);

	/* ksm created a completely new copy */
	if (unlikely(folio != swapcache && swapcache)) {
		folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
	} else if (!folio_test_anon(folio)) {
		/*
		 * We currently only expect small !anon folios which are either
		 * fully exclusive or fully shared, or newly allocated large
		 * folios which are fully exclusive. If we ever get large
		 * folios within swapcache here, we have to be careful.
		 */
		VM_WARN_ON_ONCE(folio_test_large(folio) &&
				folio_test_swapcache(folio));
		VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
		folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
	} else {
		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
					 rmap_flags);
	}

	VM_BUG_ON(!folio_test_anon(folio) ||
		  (pte_write(pte) && !PageAnonExclusive(page)));
	set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
	arch_do_swap_page_nr(vma->vm_mm, vma, address, pte, pte, nr_pages);

	folio_unlock(folio);
	if (folio != swapcache && swapcache) {
		/*
		 * Hold the lock to prevent the swap entry from being reused
		 * until we take the PT lock for the pte_same() check (to
		 * avoid false positives from pte_same). For further safety,
		 * release the lock after the swap_free so that the swap
		 * count won't change under a parallel locked swapcache.
*/ folio_unlock(swapcache); folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; out_nomap: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: folio_unlock(folio); out_release: folio_put(folio); if (folio != swapcache && swapcache) { folio_unlock(swapcache); folio_put(swapcache); } if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); if (waitqueue_active(&swapcache_wq)) wake_up(&swapcache_wq); } if (si) put_swap_device(si); return ret; } static bool pte_range_none(pte_t *pte, int nr_pages) { int i; for (i = 0; i < nr_pages; i++) { if (!pte_none(ptep_get_lockless(pte + i))) return false; } return true; } static struct folio *alloc_anon_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long orders; struct folio *folio; unsigned long addr; pte_t *pte; gfp_t gfp; int order; /* * If uffd is active for the vma we need per-page fault fidelity to * maintain the uffd semantics. */ if (unlikely(userfaultfd_armed(vma))) goto fallback; /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) goto fallback; pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK); if (!pte) return ERR_PTR(-EAGAIN); /* * Find the highest order where the aligned range is completely * pte_none(). Note that all remaining orders will be completely * pte_none(). */ order = highest_order(orders); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); if (pte_range_none(pte + pte_index(addr), 1 << order)) break; order = next_order(&orders, order); } pte_unmap(pte); if (!orders) goto fallback; /* Try allocating the highest of the remaining orders. */ gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); folio_put(folio); goto next; } folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation * (__GFP_ZERO not used) or user folios require special * handling, folio_zero_user() is used to make sure * that the page corresponding to the faulting address * will be hot in the cache after zeroing. */ if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } next: count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); order = next_order(&orders, order); } fallback: #endif return folio_prealloc(vma->vm_mm, vma, vmf->address, true); } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_lock still held, but pte unmapped and unlocked. 
*/ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; struct folio *folio; vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(), so that OOM can * be distinguished from a transient failure of pte_offset_map(). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!vmf->pte) goto unlock; if (vmf_pte_changed(vmf)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } /* Allocate our own private page. */ ret = vmf_anon_prepare(vmf); if (ret) return ret; /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */ folio = alloc_anon_folio(vmf); if (IS_ERR(folio)) return 0; if (!folio) goto oom; nr_pages = folio_nr_pages(folio); addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) goto release; if (nr_pages == 1 && vmf_pte_changed(vmf)) { update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) entry = pte_mkuffd_wp(entry); set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages); /* No need to invalidate - it was non-present before */ update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: folio_put(folio); goto unlock; oom: return VM_FAULT_OOM; } /* * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). 
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;
	vm_fault_t ret;

	/*
	 * Preallocate pte before we take page_lock because this might lead to
	 * deadlocks for memcg reclaim which waits for pages under writeback:
	 *				lock_page(A)
	 *				SetPageWriteback(A)
	 *				unlock_page(A)
	 * lock_page(B)
	 *				lock_page(B)
	 * pte_alloc_one
	 *   shrink_folio_list
	 *     wait_on_page_writeback(A)
	 *				SetPageWriteback(B)
	 *				unlock_page(B)
	 *				# flush A, B to clear the writeback
	 */
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
	}

	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))
		return ret;

	folio = page_folio(vmf->page);
	if (unlikely(PageHWPoison(vmf->page))) {
		vm_fault_t poisonret = VM_FAULT_HWPOISON;

		if (ret & VM_FAULT_LOCKED) {
			if (page_mapped(vmf->page))
				unmap_mapping_folio(folio);
			/* Retry if a clean folio was removed from the cache. */
			if (mapping_evict_folio(folio->mapping, folio))
				poisonret = VM_FAULT_NOPAGE;
			folio_unlock(folio);
		}
		folio_put(folio);
		vmf->page = NULL;
		return poisonret;
	}

	if (unlikely(!(ret & VM_FAULT_LOCKED)))
		folio_lock(folio);
	else
		VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);

	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
	/*
	 * We are going to consume the prealloc table,
	 * count that as nr_ptes.
	 */
	mm_inc_nr_ptes(vma->vm_mm);
	vmf->prealloc_pte = NULL;
}

vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	pmd_t entry;
	vm_fault_t ret = VM_FAULT_FALLBACK;

	/*
	 * It is too late to allocate a small folio, we already have a large
	 * folio in the pagecache: especially s390 KVM cannot tolerate any
	 * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse
	 * any PMD mappings if THPs are disabled.
	 */
	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
		return ret;

	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
		return ret;

	if (folio_order(folio) != HPAGE_PMD_ORDER)
		return ret;
	page = &folio->page;

	/*
	 * Just back off if any subpage of a THP is corrupted; otherwise the
	 * corrupted page may be mapped by PMD silently to escape the check.
	 * This kind of THP can only be PTE-mapped. Access to the corrupted
	 * subpage should trigger SIGBUS as expected.
	 */
	if (unlikely(folio_test_has_hwpoisoned(folio)))
		return ret;

	/*
	 * Archs like ppc64 need additional space to store information
	 * related to pte entry. Use the preallocated table for that.
	 */
	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd)))
		goto out;

	flush_icache_pages(vma, page, HPAGE_PMD_NR);

	entry = mk_huge_pmd(page, vma->vm_page_prot);
	if (write)
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
	folio_add_file_rmap_pmd(folio, page, vma);

	/*
	 * deposit and withdraw with pmd lock held
	 */
	if (arch_needs_pgtable_deposit())
		deposit_prealloc_pte(vmf);

	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

	update_mmu_cache_pmd(vma, haddr, vmf->pmd);

	/* fault is handled */
	ret = 0;
	count_vm_event(THP_FILE_MAPPED);
out:
	spin_unlock(vmf->ptl);
	return ret;
}
#else
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
	return VM_FAULT_FALLBACK;
}
#endif

/**
 * set_pte_range - Set a range of PTEs to point to pages in a folio.
 * @vmf: Fault description.
 * @folio: The folio that contains @page.
 * @page: The first page to create a PTE for.
 * @nr: The number of PTEs to create.
 * @addr: The first address to create a PTE for.
 */
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
		struct page *page, unsigned int nr, unsigned long addr)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
	pte_t entry;

	flush_icache_pages(vma, page, nr);
	entry = mk_pte(page, vma->vm_page_prot);

	if (prefault && arch_wants_old_prefaulted_pte())
		entry = pte_mkold(entry);
	else
		entry = pte_sw_mkyoung(entry);

	if (write)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
		entry = pte_mkuffd_wp(entry);
	/* copy-on-write page */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		VM_BUG_ON_FOLIO(nr != 1, folio);
		folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
	} else {
		folio_add_file_rmap_ptes(folio, page, nr, vma);
	}
	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);

	/* no need to invalidate: a not-present page won't be cached */
	update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}

static bool vmf_pte_changed(struct vm_fault *vmf)
{
	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
		return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);

	return !pte_none(ptep_get(vmf->pte));
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	struct folio *folio;
	vm_fault_t ret;
	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
		      !(vma->vm_flags & VM_SHARED);
	int type, nr_pages;
	unsigned long addr;
	bool needs_fallback = false;

fallback:
	addr = vmf->address;

	/*
	 * Did we COW the page?
*/ if (is_cow) page = vmf->cow_page; else page = vmf->page; /* * check even for read faults because we might have lost our CoWed * page */ if (!(vma->vm_flags & VM_SHARED)) { ret = check_stable_address_space(vma->vm_mm); if (ret) return ret; } if (pmd_none(*vmf->pmd)) { if (PageTransCompound(page)) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (vmf->prealloc_pte) pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; } folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same * approach also applies to non-anonymous-shmem faults to avoid * inflating the RSS of the process. */ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; /* The index of the entry in the pagetable for fault page. */ pgoff_t pte_off = pte_index(vmf->address); /* * Fallback to per-page fault in case the folio size in page * cache beyond the VMA limits and PMD pagetable limits. */ if (unlikely(vma_off < idx || vma_off + (nr_pages - idx) > vma_pages(vma) || pte_off < idx || pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { nr_pages = 1; } else { /* Now we can set mappings for the whole large folio. */ addr = vmf->address - idx * PAGE_SIZE; page = &folio->page; } } vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { needs_fallback = true; pte_unmap_unlock(vmf->pte, vmf->ptl); goto fallback; } folio_ref_add(folio, nr_pages - 1); set_pte_range(vmf, folio, page, nr_pages, addr); type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); add_mm_counter(vma->vm_mm, type, nr_pages); ret = 0; unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_pages __read_mostly = 65536 >> PAGE_SHIFT; #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_pages << PAGE_SHIFT; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; /* * The minimum value is 1 page, however this results in no fault-around * at all. See should_fault_around(). */ val = max(val, PAGE_SIZE); fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. 
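 *
 * A worked example with made-up numbers, assuming PTRS_PER_PTE == 512: with
 * fault_around_pages == 16, a fault at pte_off == 100 in a VMA that begins
 * 6 pages before the fault (vma_off == 6) and spans 10 pages yields
 *
 *	from_pte = max(ALIGN_DOWN(100, 16), 100 - 6)     = 96
 *	to_pte   = min3(96 + 16, 512, 100 + 10 - 6) - 1  = 103
 *
 * i.e. PTEs 96..103 are mapped: aligned down to the fault-around window,
 * clamped at the end of the VMA.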
* * This function doesn't cross VMA or page table boundaries, in order to call * map_pages() and acquire a PTE lock only once. * * fault_around_pages defines how many pages we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_pages * PAGE_SIZE rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { pgoff_t nr_pages = READ_ONCE(fault_around_pages); pgoff_t pte_off = pte_index(vmf->address); /* The page offset of vmf->address within the VMA. */ pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; pgoff_t from_pte, to_pte; vm_fault_t ret; /* The PTE offset of the start address, clamped to the VMA. */ from_pte = max(ALIGN_DOWN(pte_off, nr_pages), pte_off - min(pte_off, vma_off)); /* The PTE offset of the end address, clamped to the VMA and PTE. */ to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE, pte_off + vma_pages(vmf->vma) - vma_off) - 1; if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; } rcu_read_lock(); ret = vmf->vma->vm_ops->map_pages(vmf, vmf->pgoff + from_pte - pte_off, vmf->pgoff + to_pte - pte_off); rcu_read_unlock(); return ret; } /* Return true if we should do read fault-around, false otherwise */ static inline bool should_fault_around(struct vm_fault *vmf) { /* No ->map_pages? No way to fault around... */ if (!vmf->vma->vm_ops->map_pages) return false; if (uffd_disable_fault_around(vmf->vma)) return false; /* A single page implies no faulting 'around' at all. */ return fault_around_pages > 1; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). 
*/ if (should_fault_around(vmf)) { ret = do_fault_around(vmf); if (ret) return ret; } ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); folio = page_folio(vmf->page); folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) folio_put(folio); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; vm_fault_t ret; ret = vmf_can_call_fault(vmf); if (!ret) ret = vmf_anon_prepare(vmf); if (ret) return ret; folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false); if (!folio) return VM_FAULT_OOM; vmf->cow_page = &folio->page; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { ret = VM_FAULT_HWPOISON; goto unlock; } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: folio_put(folio); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; struct folio *folio; ret = vmf_can_call_fault(vmf); if (ret) return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; folio = page_folio(vmf->page); /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); return tmp; } } ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { folio_unlock(folio); folio_put(folio); return ret; } ret |= fault_dirty_shared_page(vmf); return ret; } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults). * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). * If mmap_lock is released, vma may become invalid (for example * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) { vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) ret = VM_FAULT_SIGBUS; else { /* * Make sure this is not a temporary clearing of pte * by holding ptl and checking again. A R/M/W update * of pte involves: take ptl, clearing the pte so that * we don't have concurrent modification by hardware * followed by an update. 
			 */
			if (unlikely(pte_none(ptep_get(vmf->pte))))
				ret = VM_FAULT_SIGBUS;
			else
				ret = VM_FAULT_NOPAGE;

			pte_unmap_unlock(vmf->pte, vmf->ptl);
		}
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf);
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
		       unsigned long addr, int *flags,
		       bool writable, int *last_cpupid)
{
	struct vm_area_struct *vma = vmf->vma;

	/*
	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
	 * much anyway since they can be in shared cache state. This misses
	 * the case where a mapping is writable but the process never writes
	 * to it but pte_write gets cleared during protection updates and
	 * pte_dirty has unpredictable behaviour between PTE scan updates,
	 * background writeback, dirty balancing and application behaviour.
	 */
	if (!writable)
		*flags |= TNF_NO_GROUP;

	/*
	 * Flag if the folio is shared between multiple address spaces. This
	 * is later used when determining whether to group tasks together.
	 */
	if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
		*flags |= TNF_SHARED;

	/*
	 * For memory tiering mode, cpupid of slow memory page is used
	 * to record page access time. So use default value.
	 */
	if (folio_use_access_time(folio))
		*last_cpupid = (-1 & LAST_CPUPID_MASK);
	else
		*last_cpupid = folio_last_cpupid(folio);

	/* Record the current PID accessing VMA */
	vma_set_access_pid_bit(vma);

	count_vm_numa_event(NUMA_HINT_FAULTS);
#ifdef CONFIG_NUMA_BALANCING
	count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
#endif
	if (folio_nid(folio) == numa_node_id()) {
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
		*flags |= TNF_FAULT_LOCAL;
	}

	return mpol_misplaced(folio, vmf, addr);
}

static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
					unsigned long fault_addr, pte_t *fault_pte,
					bool writable)
{
	pte_t pte, old_pte;

	old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	if (writable)
		pte = pte_mkwrite(pte, vma);
	ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, pte);
	update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
}

static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
				       struct folio *folio, pte_t fault_pte,
				       bool ignore_writable, bool pte_write_upgrade)
{
	int nr = pte_pfn(fault_pte) - folio_pfn(folio);
	unsigned long start, end, addr = vmf->address;
	unsigned long addr_start = addr - (nr << PAGE_SHIFT);
	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
	pte_t *start_ptep;

	/*
	 * Stay within the VMA and within the page table.
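	 *
	 * For example (illustrative numbers only): a 16-page folio whose
	 * first page would map 2 pages below vma->vm_start gets clamped so
	 * that start == vma->vm_start, and if addr_start + folio_size(folio)
	 * crosses the PMD boundary at pt_start + PMD_SIZE, end stops there
	 * instead. Only the surviving [start, end) range is rebuilt below.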
*/ start = max3(addr_start, pt_start, vma->vm_start); end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE, vma->vm_end); start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT); /* Restore all PTEs' mapping of the large folio */ for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) { pte_t ptent = ptep_get(start_ptep); bool writable = false; if (!pte_present(ptent) || !pte_protnone(ptent)) continue; if (pfn_folio(pte_pfn(ptent)) != folio) continue; if (!ignore_writable) { ptent = pte_modify(ptent, vma->vm_page_prot); writable = pte_write(ptent); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, addr, ptent)) writable = true; } numa_rebuild_single_mapping(vmf, vma, addr, start_ptep, writable); } } static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; int nid = NUMA_NO_NODE; bool writable = false, ignore_writable = false; bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma); int last_cpupid; int target_nid; pte_t pte, old_pte; int flags = 0, nr_pages; /* * The pte cannot be used safely until we verify, while holding the page * table lock, that its contents have not changed during fault handling. */ spin_lock(vmf->ptl); /* Read the live PTE from the page tables: */ old_pte = ptep_get(vmf->pte); if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); /* * Detect now whether the PTE could be writable; this information * is only valid while holding the PT lock. */ writable = pte_write(pte); if (!writable && pte_write_upgrade && can_change_pte_writable(vma, vmf->address, pte)) writable = true; folio = vm_normal_folio(vma, vmf->address, pte); if (!folio || folio_is_zone_device(folio)) goto out_map; nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, writable, &last_cpupid); if (target_nid == NUMA_NO_NODE) goto out_map; if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { flags |= TNF_MIGRATE_FAIL; goto out_map; } /* The folio is isolated and isolation code holds a folio reference. */ pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; /* Migrate to the requested node */ if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } flags |= TNF_MIGRATE_FAIL; vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte)) return 0; if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } out_map: /* * Make it present again, depending on how arch implements * non-accessible ptes, some can allow access by kernel mode. 
*/ if (folio && folio_test_large(folio)) numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable, pte_write_upgrade); else numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); if (nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, nid, nr_pages, flags); return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; if (vma_is_anonymous(vma)) { if (likely(!unshare) && userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { if (userfaultfd_wp_async(vmf->vma)) goto split; return handle_userfault(vmf, VM_UFFD_WP); } return do_huge_pmd_wp_page(vmf); } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not handled on PUD level: split pud.*/ __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. * * There is also a hook called "update_mmu_cache()" that architectures * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow * concurrent faults). * * The mmap_lock may have been released depending on flags and our return value. * See filemap_fault() and __folio_lock_or_retry(). */ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. 
		 */
		vmf->pte = NULL;
		vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
	} else {
		pmd_t dummy_pmdval;

		/*
		 * A regular pmd is established and it can't morph into a huge
		 * pmd by anon khugepaged, since that takes mmap_lock in write
		 * mode; but shmem or file collapse to THP could still morph
		 * it into a huge pmd: just retry later if so.
		 *
		 * Use the maywrite version to indicate that vmf->pte may be
		 * modified, but since we will use pte_same() to detect the
		 * change of the !pte_none() entry, there is no need to recheck
		 * the pmdval. Here we choose to pass a dummy variable instead
		 * of NULL, which helps new users think about why this place is
		 * special.
		 */
		vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
						    vmf->address, &dummy_pmdval,
						    &vmf->ptl);
		if (unlikely(!vmf->pte))
			return 0;
		vmf->orig_pte = ptep_get_lockless(vmf->pte);
		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

		if (pte_none(vmf->orig_pte)) {
			pte_unmap(vmf->pte);
			vmf->pte = NULL;
		}
	}

	if (!vmf->pte)
		return do_pte_missing(vmf);

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	spin_lock(vmf->ptl);
	entry = vmf->orig_pte;
	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		goto unlock;
	}
	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
		if (!pte_write(entry))
			return do_wp_page(vmf);
		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
			entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				vmf->flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
				vmf->pte, 1);
	} else {
		/* Skip spurious TLB flush for retried page fault */
		if (vmf->flags & FAULT_FLAG_TRIED)
			goto unlock;
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (vmf->flags & FAULT_FLAG_WRITE)
			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
						     vmf->pte);
	}
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

/*
 * On entry, we hold either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
 * the result, the mmap_lock is not held on exit. See filemap_fault()
 * and __folio_lock_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
		unsigned long address, unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.real_address = address,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	struct mm_struct *mm = vma->vm_mm;
	unsigned long vm_flags = vma->vm_flags;
	pgd_t *pgd;
	p4d_t *p4d;
	vm_fault_t ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
retry_pud:
	if (pud_none(*vmf.pud) &&
	    thp_vma_allowable_order(vma, vm_flags,
				TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
		ret = create_huge_pud(&vmf);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
	} else {
		pud_t orig_pud = *vmf.pud;

		barrier();
		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

			/*
			 * TODO once we support anonymous PUDs: NUMA case and
			 * FAULT_FLAG_UNSHARE handling.
*/ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) { ret = wp_huge_pud(&vmf, orig_pud); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pud_set_accessed(&vmf, orig_pud); return 0; } } } vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; /* Huge pud page fault raced with pmd_alloc? */ if (pud_trans_unstable(vmf.pud)) goto retry_pud; if (pmd_none(*vmf.pmd) && thp_vma_allowable_order(vma, vm_flags, TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); if (unlikely(is_swap_pmd(vmf.orig_pmd))) { VM_BUG_ON(thp_migration_supported() && !is_pmd_migration_entry(vmf.orig_pmd)); if (is_pmd_migration_entry(vmf.orig_pmd)) pmd_migration_entry_wait(mm, vmf.pmd); return 0; } if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !pmd_write(vmf.orig_pmd)) { ret = wp_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { huge_pmd_set_accessed(&vmf); return 0; } } } return handle_pte_fault(&vmf); } /** * mm_account_fault - Do page fault accounting * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. * @address: the faulted address. * @flags: the fault flags. * @ret: the fault retcode. * * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. */ static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs, unsigned long address, unsigned int flags, vm_fault_t ret) { bool major; /* Incomplete faults will be accounted upon completion. */ if (ret & VM_FAULT_RETRY) return; /* * To preserve the behavior of older kernels, PGFAULT counters record * both successful and failed faults, as opposed to perf counters, * which ignore failed cases. */ count_vm_event(PGFAULT); count_memcg_event_mm(mm, PGFAULT); /* * Do not account for unsuccessful faults (e.g. when the address wasn't * valid). That includes arch_vma_access_permitted() failing before * reaching here. So this is not a "this many hardware page faults" * counter. We should use the hw profiling for that. */ if (ret & VM_FAULT_ERROR) return; /* * We define the fault as a major fault when the final successful fault * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't * handle it immediately previously). */ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); if (major) current->maj_flt++; else current->min_flt++; /* * If the fault is done for GUP, regs will be NULL. We only do the * accounting for the per thread fault counters who triggered the * fault, and we skip the perf event updates. 
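	 * (For example, a GUP path such as get_user_pages() reaches
	 * handle_mm_fault() with regs == NULL, so it bumps the task's
	 * maj_flt/min_flt counters but never the perf software events.)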
*/ if (!regs) return; if (major) perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); else perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } #ifdef CONFIG_LRU_GEN static void lru_gen_enter_fault(struct vm_area_struct *vma) { /* the LRU algorithm only applies to accesses with recency */ current->in_lru_fault = vma_has_recency(vma); } static void lru_gen_exit_fault(void) { current->in_lru_fault = false; } #else static void lru_gen_enter_fault(struct vm_area_struct *vma) { } static void lru_gen_exit_fault(void) { } #endif /* CONFIG_LRU_GEN */ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, unsigned int *flags) { if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) return VM_FAULT_SIGSEGV; /* * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's * just treat it like an ordinary read-fault otherwise. */ if (!is_cow_mapping(vma->vm_flags)) *flags &= ~FAULT_FLAG_UNSHARE; } else if (*flags & FAULT_FLAG_WRITE) { /* Write faults on read-only mappings are impossible ... */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) return VM_FAULT_SIGSEGV; /* ... and FOLL_FORCE only applies to COW mappings. */ if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } #ifdef CONFIG_PER_VMA_LOCK /* * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of * the assumption that lock is dropped on VM_FAULT_RETRY. */ if (WARN_ON_ONCE((*flags & (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) return VM_FAULT_SIGSEGV; #endif return 0; } /* * By the time we get here, we already hold either the VMA lock or the * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; bool is_droppable; __set_current_state(TASK_RUNNING); ret = sanitize_fault_flags(vma, &flags); if (ret) goto out; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) { ret = VM_FAULT_SIGSEGV; goto out; } is_droppable = !!(vma->vm_flags & VM_DROPPABLE); /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) mem_cgroup_enter_user_fault(); lru_gen_enter_fault(vma); if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); else ret = __handle_mm_fault(vma, address, flags); /* * Warning: It is no longer safe to dereference vma-> after this point, * because mmap_lock might have been dropped by __handle_mm_fault(), so * vma might be destroyed from underneath us. */ lru_gen_exit_fault(); /* If the mapping is droppable, then errors due to OOM aren't fatal. */ if (is_droppable) ret &= ~VM_FAULT_OOM; if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* * The task may have entered a memcg OOM situation but * if the allocation error was handled gracefully (no * VM_FAULT_OOM), there is no need to kill anything. * Just clean up the OOM state peacefully. 
*/ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) mem_cgroup_oom_synchronize(false); } out: mm_account_fault(mm, regs, address, flags, ret); return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); #ifdef CONFIG_LOCK_MM_AND_FIND_VMA #include <linux/extable.h> static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { if (likely(mmap_read_trylock(mm))) return true; if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_read_lock_killable(mm); } static inline bool mmap_upgrade_trylock(struct mm_struct *mm) { /* * We don't have this operation yet. * * It should be easy enough to do: it's basically a * atomic_long_try_cmpxchg_acquire() * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but * it also needs the proper lockdep magic etc. */ return false; } static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) { mmap_read_unlock(mm); if (regs && !user_mode(regs)) { unsigned long ip = exception_ip(regs); if (!search_exception_tables(ip)) return false; } return !mmap_write_lock_killable(mm); } /* * Helper for page fault handling. * * This is kind of equivalent to "mmap_read_lock()" followed * by "find_extend_vma()", except it's a lot more careful about * the locking (and will drop the lock on failure). * * For example, if we have a kernel bug that causes a page * fault, we don't want to just use mmap_read_lock() to get * the mm lock, because that would deadlock if the bug were * to happen while we're holding the mm lock for writing. * * So this checks the exception tables on kernel faults in * order to only do this all for instructions that are actually * expected to fault. * * We can also actually take the mm lock for writing if we * need to extend the vma, which helps the VM layer a lot. */ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long addr, struct pt_regs *regs) { struct vm_area_struct *vma; if (!get_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (likely(vma && (vma->vm_start <= addr))) return vma; /* * Well, dang. We might still be successful, but only * if we can extend a vma to do so. */ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { mmap_read_unlock(mm); return NULL; } /* * We can try to upgrade the mmap lock atomically, * in which case we can continue to use the vma * we already looked up. * * Otherwise we'll have to drop the mmap lock and * re-take it, and also look up the vma again, * re-checking it. */ if (!mmap_upgrade_trylock(mm)) { if (!upgrade_mmap_lock_carefully(mm, regs)) return NULL; vma = find_vma(mm, addr); if (!vma) goto fail; if (vma->vm_start <= addr) goto success; if (!(vma->vm_flags & VM_GROWSDOWN)) goto fail; } if (expand_stack_locked(vma, addr)) goto fail; success: mmap_write_downgrade(mm); return vma; fail: mmap_write_unlock(mm); return NULL; } #endif #ifdef CONFIG_PER_VMA_LOCK static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) { unsigned int tgt_refcnt = VMA_LOCK_OFFSET; /* Additional refcnt if the vma is attached. */ if (!detaching) tgt_refcnt++; /* * If vma is detached then only vma_mark_attached() can raise the * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). 
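	 *
	 * As a rough sketch of the counting scheme (derived from the code
	 * below, not a separate contract): an attached, unlocked vma holds
	 * vm_refcnt == 1, each temporary reader adds 1, and the writer adds
	 * VMA_LOCK_OFFSET here; waiting for vm_refcnt == tgt_refcnt therefore
	 * means "the attach reference (if any) plus our offset, and no
	 * readers left".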
*/ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) return false; rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, refcount_read(&vma->vm_refcnt) == tgt_refcnt, TASK_UNINTERRUPTIBLE); lock_acquired(&vma->vmlock_dep_map, _RET_IP_); return true; } static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) { *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); rwsem_release(&vma->vmlock_dep_map, _RET_IP_); } void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) { bool locked; /* * __vma_enter_locked() returns false immediately if the vma is not * attached, otherwise it waits until refcnt is indicating that vma * is attached with no readers. */ locked = __vma_enter_locked(vma, false); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); if (locked) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(detached); /* vma should remain attached */ } } EXPORT_SYMBOL_GPL(__vma_start_write); void vma_mark_detached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_attached(vma); /* * We are the only writer, so no need to use vma_refcount_put(). * The condition below is unlikely because the vma has been already * write-locked and readers can increment vm_refcnt only temporarily * before they check vm_lock_seq, realize the vma is locked and drop * back the vm_refcnt. That is a narrow window for observing a raised * vm_refcnt. */ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { /* Wait until vma is detached with no readers. */ if (__vma_enter_locked(vma, true)) { bool detached; __vma_exit_locked(vma, &detached); WARN_ON_ONCE(!detached); } } } /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the * function returns NULL. */ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; rcu_read_lock(); retry: vma = mas_walk(&mas); if (!vma) goto inval; vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { count_vm_vma_lock_event(VMA_LOCK_MISS); /* The area was replaced with another one */ goto retry; } /* Failed to lock the VMA */ goto inval; } /* * At this point, we have a stable reference to a VMA: The VMA is * locked and we know it hasn't already been isolated. * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ /* Check if the vma we locked is the right one. */ if (unlikely(vma->vm_mm != mm || address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); return vma; inval_end_read: vma_end_read(vma); inval: rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } #endif /* CONFIG_PER_VMA_LOCK */ #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. * We've already handled the fast-path in-line. 
*/ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { p4d_t *new = p4d_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * We've already handled the fast-path in-line. */ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { pud_t *new = pud_alloc_one(mm, address); if (!new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * We've already handled the fast-path in-line. */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, spinlock_t *lock, pte_t *ptep, pgprot_t pgprot, unsigned long pfn_base, unsigned long addr_mask, bool writable, bool special) { args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); args->addr_mask = addr_mask; args->pgprot = pgprot; args->writable = writable; args->special = special; } static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) { #ifdef CONFIG_LOCKDEP struct file *file = vma->vm_file; struct address_space *mapping = file ? file->f_mapping : NULL; if (mapping) lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || lockdep_is_held(&vma->vm_mm->mmap_lock)); else lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); #endif } /** * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address * @args: Pointer to struct @follow_pfnmap_args * * The caller needs to setup args->vma and args->address to point to the * virtual address as the target of such lookup. On a successful return, * the results will be put into other output fields. * * After the caller finished using the fields, the caller must invoke * another follow_pfnmap_end() to proper releases the locks and resources * of such look up request. * * During the start() and end() calls, the results in @args will be valid * as proper locks will be held. After the end() is called, all the fields * in @follow_pfnmap_args will be invalid to be further accessed. Further * use of such information after end() may require proper synchronizations * by the caller with page table updates, otherwise it can create a * security bug. * * If the PTE maps a refcounted page, callers are responsible to protect * against invalidation with MMU notifiers; otherwise access to the PFN at * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore * should be taken for read, and the mmap semaphore cannot be released * before the end() is invoked. 
* * This function must not be used to modify PTE content. * * Return: zero on success, negative otherwise. */ int follow_pfnmap_start(struct follow_pfnmap_args *args) { struct vm_area_struct *vma = args->vma; unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; spinlock_t *lock; pgd_t *pgdp; p4d_t *p4dp, p4d; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; pfnmap_lockdep_assert(vma); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; retry: pgdp = pgd_offset(mm, address); if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); pud = READ_ONCE(*pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); if (!unlikely(pud_leaf(pud))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), pud_pfn(pud), PUD_MASK, pud_write(pud), pud_special(pud)); return 0; } pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); if (!unlikely(pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), pmd_special(pmd)); return 0; } ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; pte = ptep_get(ptep); if (!pte_present(pte)) goto unlock; pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), pte_pfn(pte), PAGE_MASK, pte_write(pte), pte_special(pte)); return 0; unlock: pte_unmap_unlock(ptep, lock); out: return -EINVAL; } EXPORT_SYMBOL_GPL(follow_pfnmap_start); /** * follow_pfnmap_end(): End a follow_pfnmap_start() process * @args: Pointer to struct @follow_pfnmap_args * * Must be used in pair of follow_pfnmap_start(). See the start() function * above for more information. */ void follow_pfnmap_end(struct follow_pfnmap_args *args) { if (args->lock) spin_unlock(args->lock); if (args->ptep) pte_unmap(args->ptep); } EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading * * This is a generic implementation for &vm_operations_struct.access for an * iomem mapping. This callback is used by access_process_vm() when the @vma is * not page based. 
*/ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; bool writable; struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: if (follow_pfnmap_start(&args)) return -EINVAL; prot = args.pgprot; phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; if (follow_pfnmap_start(&args)) goto out_unmap; if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); iounmap(maddr); goto retry; } if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); ret = len; follow_pfnmap_end(&args); out_unmap: iounmap(maddr); return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* * Access another process' address space as given in mm. */ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; if (mmap_read_lock_killable(mm)) return 0; /* Untag the address before looking up the VMA */ addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr) && !expand_stack(mm, addr)) return 0; /* ignore errors, just check how much was successfully transferred */ while (len) { int bytes, offset; void *maddr; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { vma = expand_stack(mm, addr); /* mmap_lock was dropped on failure */ if (!vma) return buf - old_buf; /* Try again if stack expansion worked */ continue; } /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ bytes = 0; #ifdef CONFIG_HAVE_IOREMAP_PROT if (vma->vm_ops && vma->vm_ops->access) bytes = vma->vm_ops->access(vma, addr, buf, len, write); #endif if (bytes <= 0) break; } else { bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; maddr = kmap_local_page(page); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } unmap_and_put_page(page, maddr); } len -= bytes; buf += bytes; addr += bytes; } mmap_read_unlock(mm); return buf - old_buf; } /** * access_remote_vm - access another process' address space * @mm: the mm_struct of the target address space * @addr: start address to access * @buf: source or destination buffer * @len: number of bytes to transfer * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* * Access another process' address space. 
* Source/target buffer must be kernel space, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; mm = get_task_mm(tsk); if (!mm) return 0; ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(access_process_vm); #ifdef CONFIG_BPF_SYSCALL /* * Copy a string from another process's address space as given in mm. * If there is any error return -EFAULT. */ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int err = 0; *(char *)buf = '\0'; if (mmap_read_lock_killable(mm)) return -EFAULT; addr = untagged_addr_remote(mm, addr); /* Avoid triggering the temporary warning in __get_user_pages */ if (!vma_lookup(mm, addr)) { err = -EFAULT; goto out; } while (len) { int bytes, offset, retval; void *maddr; struct page *page; struct vm_area_struct *vma = NULL; page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); if (IS_ERR(page)) { /* * Treat as a total failure for now until we decide how * to handle the CONFIG_HAVE_IOREMAP_PROT case and * stack expansion. */ *(char *)buf = '\0'; err = -EFAULT; goto out; } bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; maddr = kmap_local_page(page); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; unmap_and_put_page(page, maddr); break; } buf += bytes - 1; /* * Because strscpy always NUL terminates we need to * copy the last byte in the page if we are going to * load more pages */ if (bytes != len) { addr += bytes - 1; copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); buf += 1; addr += 1; } len -= bytes; unmap_and_put_page(page, maddr); } out: mmap_read_unlock(mm); if (err) return err; return buf - old_buf; } /** * copy_remote_vm_str - copy a string from another process's address space. * @tsk: the task of the target address space * @addr: start address to read from * @buf: destination buffer * @len: number of bytes to copy * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. * * Return: number of bytes copied from @addr (source) to @buf (destination); * not including the trailing NUL. Always guaranteed to leave NUL-terminated * buffer. On any error, return -EFAULT. */ int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; if (unlikely(len == 0)) return 0; mm = get_task_mm(tsk); if (!mm) { *(char *)buf = '\0'; return -EFAULT; } ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); mmput(mm); return ret; } EXPORT_SYMBOL_GPL(copy_remote_vm_str); #endif /* CONFIG_BPF_SYSCALL */ /* * Print the name of a VMA. 
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	/*
	 * we might be running from an atomic context so we cannot sleep
	 */
	if (!mmap_read_trylock(mm))
		return;

	vma = vma_lookup(mm, ip);
	if (vma && vma->vm_file) {
		struct file *f = vma->vm_file;
		ip -= vma->vm_start;
		ip += vma->vm_pgoff << PAGE_SHIFT;
		printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
				vma->vm_start,
				vma->vm_end - vma->vm_start);
	}
	mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
	if (pagefault_disabled())
		return;
	__might_sleep(file, line);
	if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation. The target subpage will be processed last to keep its
 * cache lines hot.
 */
static inline int process_huge_page(
	unsigned long addr_hint, unsigned int nr_pages,
	int (*process_subpage)(unsigned long addr, int idx, void *arg),
	void *arg)
{
	int i, n, base, l, ret;
	unsigned long addr = addr_hint &
		~(((unsigned long)nr_pages << PAGE_SHIFT) - 1);

	/* Process target subpage last to keep its cache lines hot */
	might_sleep();
	n = (addr_hint - addr) / PAGE_SIZE;
	if (2 * n <= nr_pages) {
		/* If target subpage in first half of huge page */
		base = 0;
		l = n;
		/* Process subpages at the end of huge page */
		for (i = nr_pages - 1; i >= 2 * n; i--) {
			cond_resched();
			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
			if (ret)
				return ret;
		}
	} else {
		/* If target subpage in second half of huge page */
		base = nr_pages - 2 * (nr_pages - n);
		l = nr_pages - n;
		/* Process subpages at the beginning of the huge page */
		for (i = 0; i < base; i++) {
			cond_resched();
			ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
			if (ret)
				return ret;
		}
	}
	/*
	 * Process remaining subpages in left-right-left-right pattern
	 * towards the target subpage
	 */
	for (i = 0; i < l; i++) {
		int left_idx = base + i;
		int right_idx = base + 2 * l - 1 - i;

		cond_resched();
		ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
		if (ret)
			return ret;
		cond_resched();
		ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
		if (ret)
			return ret;
	}
	return 0;
}

static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
				unsigned int nr_pages)
{
	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
	int i;

	might_sleep();
	for (i = 0; i < nr_pages; i++) {
		cond_resched();
		clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE);
	}
}

static int clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct folio *folio = arg;

	clear_user_highpage(folio_page(folio, idx), addr);
	return 0;
}

/**
 * folio_zero_user - Zero a folio which will be mapped to userspace.
 * @folio: The folio to zero.
 * @addr_hint: The address that will be accessed, or the base address if
 *             unclear.
*/ void folio_zero_user(struct folio *folio, unsigned long addr_hint) { unsigned int nr_pages = folio_nr_pages(folio); if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) clear_gigantic_page(folio, addr_hint, nr_pages); else process_huge_page(addr_hint, nr_pages, clear_subpage, folio); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); src_page = folio_page(src, i); cond_resched(); if (copy_mc_user_highpage(dst_page, src_page, addr + i*PAGE_SIZE, vma)) return -EHWPOISON; } return 0; } struct copy_subpage_arg { struct folio *dst; struct folio *src; struct vm_area_struct *vma; }; static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; struct page *dst = folio_page(copy_arg->dst, idx); struct page *src = folio_page(copy_arg->src, idx); if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) return -EHWPOISON; return 0; } int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma) { unsigned int nr_pages = folio_nr_pages(dst); struct copy_subpage_arg arg = { .dst = dst, .src = src, .vma = vma, }; if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages); return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg); } long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault) { void *kaddr; unsigned long i, rc = 0; unsigned int nr_pages = folio_nr_pages(dst_folio); unsigned long ret_val = nr_pages * PAGE_SIZE; struct page *subpage; for (i = 0; i < nr_pages; i++) { subpage = folio_page(dst_folio, i); kaddr = kmap_local_page(subpage); if (!allow_pagefault) pagefault_disable(); rc = copy_from_user(kaddr, usr_src + i * PAGE_SIZE, PAGE_SIZE); if (!allow_pagefault) pagefault_enable(); kunmap_local(kaddr); ret_val -= (PAGE_SIZE - rc); if (rc) break; flush_dcache_page(subpage); cond_resched(); } return ret_val; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; void __init ptlock_cache_init(void) { page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, SLAB_PANIC, NULL); } bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; ptdesc->ptl = ptl; return true; } void ptlock_free(struct ptdesc *ptdesc) { if (ptdesc->ptl) kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif void vma_pgtable_walk_begin(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_lock_read(vma); } void vma_pgtable_walk_end(struct vm_area_struct *vma) { if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); }
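/*
 * Editor's note: a usage sketch (not part of mm/memory.c above) showing the
 * follow_pfnmap_start()/follow_pfnmap_end() pairing that the kernel-doc
 * above requires. The helper name example_lookup_pfn() is hypothetical; the
 * rule it illustrates -- hold the mmap lock for read across the whole
 * start()/end() window and treat the output fields as invalid after end()
 * -- is taken from the documentation above.
 */
#include <linux/mm.h>

static int example_lookup_pfn(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long *pfn)
{
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };
	int ret;

	mmap_assert_locked(vma->vm_mm);	/* read lock held by the caller */

	ret = follow_pfnmap_start(&args);
	if (ret)
		return ret;

	/* Output fields are only stable until follow_pfnmap_end(). */
	*pfn = args.pfn;

	follow_pfnmap_end(&args);
	return 0;
}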
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
 * cpuset interface
 *
 * Copyright (C) 2003 BULL SA
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/task.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/jump_label.h>

#ifdef CONFIG_CPUSETS

/*
 * Static branch rewrites can happen in an arbitrary order for a given
 * key. In code paths where we need to loop with read_mems_allowed_begin() and
 * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
 * to ensure that begin() always gets rewritten before retry() in the
 * disabled -> enabled transition. If not, then if local irqs are disabled
 * around the loop, we can deadlock since retry() would always be
 * comparing the latest value of the mems_allowed seqcount against 0 as
 * begin() still would see cpusets_enabled() as false. The enabled -> disabled
 * transition should happen in reverse order for the same reasons (want to stop
 * looking at real value of mems_allowed.sequence in retry() first).
 */
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
extern struct static_key_false cpusets_insane_config_key;

static inline bool cpusets_enabled(void)
{
	return static_branch_unlikely(&cpusets_enabled_key);
}

static inline void cpuset_inc(void)
{
	static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
	static_branch_inc_cpuslocked(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
	static_branch_dec_cpuslocked(&cpusets_enabled_key);
	static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}

/*
 * This will get enabled whenever a cpuset configuration is considered
 * unsupportable in general. E.g. movable only node which cannot satisfy
 * any non movable allocations (see update_nodemask). Page allocator
 * needs to make additional checks for those configurations and this
 * check is meant to guard those checks without any overhead for sane
 * configurations.
*/ static inline bool cpusets_insane_config(void) { return static_branch_unlikely(&cpusets_insane_config_key); } extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern bool cpuset_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return cpuset_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_zone_allowed(z, gfp_mask); return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); #ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ __cpuset_memory_pressure_bump(); \ } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); #else static inline void cpuset_memory_pressure_bump(void) { } #endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { return task_spread_page(current); } extern bool current_cpuset_is_being_rebound(void); extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving * mems_allowed such as during page allocation. mems_allowed can be updated in * parallel and depending on the new value an operation can fail potentially * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ static inline unsigned int read_mems_allowed_begin(void) { if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(&current->mems_allowed_seq); } /* * If this returns true, the operation that took place after * read_mems_allowed_begin may have failed artificially due to a concurrent * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. 
*/ static inline bool read_mems_allowed_retry(unsigned int seq) { if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(&current->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { unsigned long flags; task_lock(current); local_irq_save(flags); write_seqcount_begin(&current->mems_allowed_seq); current->mems_allowed = nodemask; write_seqcount_end(&current->mems_allowed_seq); local_irq_restore(flags); task_unlock(current); } #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } static inline bool cpusets_insane_config(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; } static inline bool cpuset_cpu_is_isolated(int cpu) { return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; } #define cpuset_current_mems_allowed (node_states[N_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return 1; } static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { } static inline int cpuset_mem_spread_node(void) { return 0; } static inline int cpuset_do_page_mem_spread(void) { return 0; } static inline bool current_cpuset_is_being_rebound(void) { return false; } static inline void dl_rebuild_rd_accounting(void) { } static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_reset_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_print_current_mems_allowed(void) { } static inline void set_mems_allowed(nodemask_t nodemask) { } static inline unsigned int read_mems_allowed_begin(void) { return 0; } static inline bool read_mems_allowed_retry(unsigned int seq) { return false; } #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */
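/*
 * Editor's note: a minimal sketch (not part of the header above) of the
 * retry loop that read_mems_allowed_begin()/read_mems_allowed_retry() are
 * documented for. try_alloc_from_mems() is a hypothetical stand-in for any
 * allocation that consults current->mems_allowed; the in-tree page and slab
 * allocators follow this same begin/retry shape.
 */
#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *example_alloc_respecting_mems(gfp_t gfp)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		page = try_alloc_from_mems(gfp);	/* hypothetical helper */
	} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

	return page;
}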
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/coproc.h
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Authors: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__
#define __ARM64_KVM_SYS_REGS_LOCAL_H__

#include <linux/bsearch.h>

#define reg_to_encoding(x)						\
	sys_reg((u32)(x)->Op0, (u32)(x)->Op1,				\
		(u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)

struct sys_reg_params {
	u8	Op0;
	u8	Op1;
	u8	CRn;
	u8	CRm;
	u8	Op2;
	u64	regval;
	bool	is_write;
};

#define encoding_to_params(reg)						\
	((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg),		\
				  .Op1 = sys_reg_Op1(reg),		\
				  .CRn = sys_reg_CRn(reg),		\
				  .CRm = sys_reg_CRm(reg),		\
				  .Op2 = sys_reg_Op2(reg) })

#define esr_sys64_to_params(esr)					\
	((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3,		\
				  .Op1 = ((esr) >> 14) & 0x7,		\
				  .CRn = ((esr) >> 10) & 0xf,		\
				  .CRm = ((esr) >> 1) & 0xf,		\
				  .Op2 = ((esr) >> 17) & 0x7,		\
				  .is_write = !((esr) & 1) })

#define esr_cp1x_32_to_params(esr)					\
	((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7,		\
				  .CRn = ((esr) >> 10) & 0xf,		\
				  .CRm = ((esr) >> 1) & 0xf,		\
				  .Op2 = ((esr) >> 17) & 0x7,		\
				  .is_write = !((esr) & 1) })

struct sys_reg_desc {
	/* Sysreg string for debug */
	const char *name;

	enum {
		AA32_DIRECT,
		AA32_LO,
		AA32_HI,
	} aarch32_map;

	/* MRS/MSR instruction which accesses it. */
	u8	Op0;
	u8	Op1;
	u8	CRn;
	u8	CRm;
	u8	Op2;

	/* Trapped access from guest, if non-NULL. */
	bool (*access)(struct kvm_vcpu *,
		       struct sys_reg_params *,
		       const struct sys_reg_desc *);

	/*
	 * Initialization for vcpu. Return initialized value, or KVM
	 * sanitized value for ID registers.
	 */
	u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);

	/* Index into sys_reg[], or 0 if we don't need to save it. */
	int reg;

	/* Value (usually reset value), or write mask for idregs */
	u64 val;

	/* Custom get/set_user functions, fallback to generic if NULL */
	int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
			u64 *val);
	int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
			u64 val);

	/* Return mask of REG_* runtime visibility overrides */
	unsigned int (*visibility)(const struct kvm_vcpu *vcpu,
				   const struct sys_reg_desc *rd);
};

#define REG_HIDDEN	(1 << 0) /* hidden from userspace and guest */
#define REG_RAZ		(1 << 1) /* RAZ from userspace and guest */
#define REG_USER_WI	(1 << 2) /* WI from userspace only */

static __printf(2, 3)
inline void print_sys_reg_msg(const struct sys_reg_params *p, char *fmt, ...)
{ va_list va; va_start(va, fmt); /* Look, we even formatted it for you to paste into the table! */ kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n", &(struct va_format){ fmt, &va }, p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read"); va_end(va); } static inline void print_sys_reg_instr(const struct sys_reg_params *p) { /* GCC warns on an empty format string */ print_sys_reg_msg(p, "%s", ""); } static inline bool ignore_write(struct kvm_vcpu *vcpu, const struct sys_reg_params *p) { return true; } static inline bool read_zero(struct kvm_vcpu *vcpu, struct sys_reg_params *p) { p->regval = 0; return true; } /* Reset functions */ static inline u64 reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; return __vcpu_sys_reg(vcpu, r->reg); } static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = r->val; return __vcpu_sys_reg(vcpu, r->reg); } static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (likely(!r->visibility)) return 0; return r->visibility(vcpu, r); } static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return sysreg_visibility(vcpu, r) & REG_HIDDEN; } static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return sysreg_visibility(vcpu, r) & REG_RAZ; } static inline bool sysreg_user_write_ignore(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { return sysreg_visibility(vcpu, r) & REG_USER_WI; } static inline int cmp_sys_reg(const struct sys_reg_desc *i1, const struct sys_reg_desc *i2) { BUG_ON(i1 == i2); if (!i1) return 1; else if (!i2) return -1; if (i1->Op0 != i2->Op0) return i1->Op0 - i2->Op0; if (i1->Op1 != i2->Op1) return i1->Op1 - i2->Op1; if (i1->CRn != i2->CRn) return i1->CRn - i2->CRn; if (i1->CRm != i2->CRm) return i1->CRm - i2->CRm; return i1->Op2 - i2->Op2; } static inline int match_sys_reg(const void *key, const void *elt) { const unsigned long pval = (unsigned long)key; const struct sys_reg_desc *r = elt; return pval - reg_to_encoding(r); } static inline const struct sys_reg_desc * find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], unsigned int num) { unsigned long pval = reg_to_encoding(params); return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index); int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu); #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x #define CRn(_x) .CRn = _x #define CRm(_x) .CRm = _x #define Op2(_x) .Op2 = _x #define SYS_DESC(reg) \ .name = #reg, \ Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ 
Op2(sys_reg_Op2(reg)) #define CP15_SYS_DESC(reg) \ .name = #reg, \ .aarch32_map = AA32_DIRECT, \ Op0(0), Op1(sys_reg_Op1(reg)), \ CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ Op2(sys_reg_Op2(reg)) #define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit) \ ({ \ u64 __f_val = FIELD_GET(reg##_##field##_MASK, val); \ (val) &= ~reg##_##field##_MASK; \ (val) |= FIELD_PREP(reg##_##field##_MASK, \ min(__f_val, \ (u64)SYS_FIELD_VALUE(reg, field, limit))); \ (val); \ }) #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */
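/*
 * Editor's note: an illustrative table entry (not part of the header above)
 * showing how SYS_DESC() and find_reg() compose. access_actlr() and the
 * one-element table are examples only; the real, much larger tables live in
 * arch/arm64/kvm/sys_regs.c and must stay sorted by encoding because
 * find_reg() is a binary search.
 */
static bool access_actlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
			 const struct sys_reg_desc *r)
{
	if (p->is_write)
		return ignore_write(vcpu, p);

	p->regval = __vcpu_sys_reg(vcpu, r->reg);
	return true;
}

/* Positional fields after SYS_DESC() are .access, .reset and .reg. */
static const struct sys_reg_desc example_regs[] = {
	{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_unknown, ACTLR_EL1 },
};

static const struct sys_reg_desc *
example_find(const struct sys_reg_params *params)
{
	return find_reg(params, example_regs, ARRAY_SIZE(example_regs));
}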
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Type definitions for the multi-level security (MLS) policy.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */
/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *	Support for enhanced MLS infrastructure.
 *	Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 */

#ifndef _SS_MLS_TYPES_H_
#define _SS_MLS_TYPES_H_

#include "security.h"
#include "ebitmap.h"

struct mls_level {
	u32 sens;		/* sensitivity */
	struct ebitmap cat;	/* category set */
};

struct mls_range {
	struct mls_level level[2]; /* low == level[0], high == level[1] */
};

static inline int mls_level_eq(const struct mls_level *l1,
			       const struct mls_level *l2)
{
	return ((l1->sens == l2->sens) &&
		ebitmap_equal(&l1->cat, &l2->cat));
}

static inline int mls_level_dom(const struct mls_level *l1,
				const struct mls_level *l2)
{
	return ((l1->sens >= l2->sens) &&
		ebitmap_contains(&l1->cat, &l2->cat, 0));
}

#define mls_level_incomp(l1, l2) \
	(!mls_level_dom((l1), (l2)) && !mls_level_dom((l2), (l1)))

#define mls_level_between(l1, l2, l3) \
	(mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1)))

#define mls_range_contains(r1, r2) \
	(mls_level_dom(&(r2).level[0], &(r1).level[0]) && \
	 mls_level_dom(&(r1).level[1], &(r2).level[1]))

#endif /* _SS_MLS_TYPES_H_ */
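/*
 * Editor's note: a small usage sketch (not part of the header above). MLS
 * access decisions are built from the dominance relation: a level lies
 * inside a range when it dominates the low level and is dominated by the
 * high level, which is exactly what mls_level_between() expands to.
 */
static int example_level_in_range(const struct mls_level *lvl,
				  const struct mls_range *rng)
{
	/* dom(lvl, low) && dom(high, lvl) */
	return mls_level_between(lvl, &rng->level[0], &rng->level[1]);
}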
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H

#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/virtio_net.h>

static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
{
	switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
	case VIRTIO_NET_HDR_GSO_TCPV4:
		return protocol == cpu_to_be16(ETH_P_IP);
	case VIRTIO_NET_HDR_GSO_TCPV6:
		return protocol == cpu_to_be16(ETH_P_IPV6);
	case VIRTIO_NET_HDR_GSO_UDP:
	case VIRTIO_NET_HDR_GSO_UDP_L4:
		return protocol == cpu_to_be16(ETH_P_IP) ||
		       protocol == cpu_to_be16(ETH_P_IPV6);
	default:
		return false;
	}
}

static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
					   const struct virtio_net_hdr *hdr)
{
	if (skb->protocol)
		return 0;

	switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
	case VIRTIO_NET_HDR_GSO_TCPV4:
	case VIRTIO_NET_HDR_GSO_UDP:
	case VIRTIO_NET_HDR_GSO_UDP_L4:
		skb->protocol = cpu_to_be16(ETH_P_IP);
		break;
	case VIRTIO_NET_HDR_GSO_TCPV6:
		skb->protocol = cpu_to_be16(ETH_P_IPV6);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
					const struct virtio_net_hdr *hdr,
					bool little_endian)
{
	unsigned int nh_min_len = sizeof(struct iphdr);
	unsigned int gso_type = 0;
	unsigned int thlen = 0;
	unsigned int p_off = 0;
	unsigned int ip_proto;

	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
		case VIRTIO_NET_HDR_GSO_TCPV4:
			gso_type = SKB_GSO_TCPV4;
			ip_proto = IPPROTO_TCP;
			thlen = sizeof(struct tcphdr);
			break;
		case VIRTIO_NET_HDR_GSO_TCPV6:
			gso_type = SKB_GSO_TCPV6;
			ip_proto = IPPROTO_TCP;
			thlen = sizeof(struct tcphdr);
			nh_min_len = sizeof(struct ipv6hdr);
			break;
		case VIRTIO_NET_HDR_GSO_UDP:
			gso_type = SKB_GSO_UDP;
			ip_proto = IPPROTO_UDP;
			thlen = sizeof(struct udphdr);
			break;
		case VIRTIO_NET_HDR_GSO_UDP_L4:
			gso_type = SKB_GSO_UDP_L4;
			ip_proto = IPPROTO_UDP;
			thlen = sizeof(struct udphdr);
			break;
		default:
			return -EINVAL;
		}

		if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
			gso_type |= SKB_GSO_TCP_ECN;

		if (hdr->gso_size == 0)
			return -EINVAL;
	}

	skb_reset_mac_header(skb);

	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
		u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
		u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
		u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));

		if (!pskb_may_pull(skb, needed))
			return -EINVAL;

		if (!skb_partial_csum_set(skb, start, off))
			return -EINVAL;
		if (skb_transport_offset(skb) < nh_min_len)
			return -EINVAL;

		nh_min_len = skb_transport_offset(skb);
		p_off = nh_min_len + thlen;
		if
(!pskb_may_pull(skb, p_off)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ if (gso_type && skb->network_header) { struct flow_keys_basic keys; if (!skb->protocol) { __be16 protocol = dev_parse_header_protocol(skb); if (!protocol) virtio_net_hdr_set_proto(skb, hdr); else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type)) return -EINVAL; else skb->protocol = protocol; } retry: if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) { /* UFO does not specify ipv4 or 6: try both */ if (gso_type & SKB_GSO_UDP && skb->protocol == htons(ETH_P_IP)) { skb->protocol = htons(ETH_P_IPV6); goto retry; } return -EINVAL; } p_off = keys.control.thoff + thlen; if (!pskb_may_pull(skb, p_off) || keys.basic.ip_proto != ip_proto) return -EINVAL; skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); switch (gso_type & ~SKB_GSO_TCP_ECN) { case SKB_GSO_UDP: /* UFO may not include transport header in gso_size. */ nh_off -= thlen; break; case SKB_GSO_UDP_L4: if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) return -EINVAL; if (skb->csum_offset != offsetof(struct udphdr, check)) return -EINVAL; if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS) return -EINVAL; if (gso_type != SKB_GSO_UDP_L4) return -EINVAL; break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: if (skb->ip_summed == CHECKSUM_PARTIAL && skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } /* Kernel has a special handling for GSO_BY_FRAGS. */ if (gso_size == GSO_BY_FRAGS) return -EINVAL; /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; /* Header must be checked, and gso_segs computed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } } return 0; } static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, bool has_data_valid, int vlan_hlen) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ if (skb_is_gso(skb)) { struct skb_shared_info *sinfo = skb_shinfo(skb); /* This is a hint as to how much should be linear. */ hdr->hdr_len = __cpu_to_virtio16(little_endian, skb_headlen(skb)); hdr->gso_size = __cpu_to_virtio16(little_endian, sinfo->gso_size); if (sinfo->gso_type & SKB_GSO_TCPV4) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (sinfo->gso_type & SKB_GSO_UDP_L4) hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->csum_start = __cpu_to_virtio16(little_endian, skb_checksum_start_offset(skb) + vlan_hlen); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); } else if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY) { hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; } #endif /* _LINUX_VIRTIO_NET_H */
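/*
 * Editor's note: a sketch (not part of the header above) of the receive-side
 * shape used by tap/tun-style drivers: apply the virtio_net_hdr to a freshly
 * built skb and drop the packet if the metadata is inconsistent. example_rx()
 * is hypothetical; the error handling mirrors the in-tree callers.
 */
#include <linux/netdevice.h>
#include <linux/virtio_net.h>

static int example_rx(struct sk_buff *skb, const struct virtio_net_hdr *hdr,
		      bool little_endian)
{
	/* Transfers checksum offsets and GSO parameters onto the skb. */
	if (virtio_net_hdr_to_skb(skb, hdr, little_endian)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	netif_rx(skb);
	return 0;
}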
/*
 *  linux/include/linux/console.h
 *
 *  Copyright (C) 1993        Hamish Macdonald
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details.
 *
 * Changed:
 * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX
 */

#ifndef _LINUX_CONSOLE_H_
#define _LINUX_CONSOLE_H_ 1

#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/irq_work.h>
#include <linux/rculist.h>
#include <linux/rcuwait.h>
#include <linux/types.h>
#include <linux/vesa.h>

struct vc_data;
struct console_font_op;
struct console_font;
struct module;
struct tty_struct;
struct notifier_block;

enum con_scroll {
	SM_UP,
	SM_DOWN,
};

enum vc_intensity;

/**
 * struct consw - callbacks for consoles
 *
 * @owner: the module to get references of when this console is used
 * @con_startup: set up the console and return its name (like VGA, EGA, ...)
* @con_init: initialize the console on @vc. @init is true for the very first * call on this @vc. * @con_deinit: deinitialize the console from @vc. * @con_clear: erase @count characters at [@x, @y] on @vc. @count >= 1. * @con_putc: emit one character with attributes @ca to [@x, @y] on @vc. * (optional -- @con_putcs would be called instead) * @con_putcs: emit @count characters with attributes @s to [@x, @y] on @vc. * @con_cursor: enable/disable cursor depending on @enable * @con_scroll: move lines from @top to @bottom in direction @dir by @lines. * Return true if no generic handling should be done. * Invoked by csi_M and printing to the console. * @con_switch: notifier about the console switch; it is supposed to return * true if a redraw is needed. * @con_blank: blank/unblank the console. The target mode is passed in @blank. * @mode_switch is set if changing from/to text/graphics. The hook * is supposed to return true if a redraw is needed. * @con_font_set: set console @vc font to @font with height @vpitch. @flags can * be %KD_FONT_FLAG_DONT_RECALC. (optional) * @con_font_get: fetch the current font on @vc of height @vpitch into @font. * (optional) * @con_font_default: set default font on @vc. @name can be %NULL or font name * to search for. @font can be filled back. (optional) * @con_resize: resize the @vc console to @width x @height. @from_user is true * when this change comes from the user space. * @con_set_palette: sets the palette of the console @vc to @table (optional) * @con_scrolldelta: the contents of the console should be scrolled by @lines. * Invoked by user. (optional) * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not * provided or returns false, the origin is set to * @vc->vc_screenbuf. (optional) * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g. * upon entering graphics. (optional) * @con_build_attr: build attributes based on @color, @intensity and other * parameters. The result is used for both normal and erase * characters. (optional) * @con_invert_region: invert a region of length @count on @vc starting at @p. * (optional) * @con_debug_enter: prepare the console for the debugger. This includes, but * is not limited to, unblanking the console, loading an * appropriate palette, and allowing debugger generated output. * (optional) * @con_debug_leave: restore the console to its pre-debug state as closely as * possible. 
(optional) */ struct consw { struct module *owner; const char *(*con_startup)(void); void (*con_init)(struct vc_data *vc, bool init); void (*con_deinit)(struct vc_data *vc); void (*con_clear)(struct vc_data *vc, unsigned int y, unsigned int x, unsigned int count); void (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y, unsigned int x); void (*con_putcs)(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos); void (*con_cursor)(struct vc_data *vc, bool enable); bool (*con_scroll)(struct vc_data *vc, unsigned int top, unsigned int bottom, enum con_scroll dir, unsigned int lines); bool (*con_switch)(struct vc_data *vc); bool (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank, bool mode_switch); int (*con_font_set)(struct vc_data *vc, const struct console_font *font, unsigned int vpitch, unsigned int flags); int (*con_font_get)(struct vc_data *vc, struct console_font *font, unsigned int vpitch); int (*con_font_default)(struct vc_data *vc, struct console_font *font, const char *name); int (*con_resize)(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user); void (*con_set_palette)(struct vc_data *vc, const unsigned char *table); void (*con_scrolldelta)(struct vc_data *vc, int lines); bool (*con_set_origin)(struct vc_data *vc); void (*con_save_screen)(struct vc_data *vc); u8 (*con_build_attr)(struct vc_data *vc, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic); void (*con_invert_region)(struct vc_data *vc, u16 *p, int count); void (*con_debug_enter)(struct vc_data *vc); void (*con_debug_leave)(struct vc_data *vc); }; extern const struct consw *conswitchp; extern const struct consw dummy_con; /* dummy console buffer */ extern const struct consw vga_con; /* VGA text console */ extern const struct consw newport_con; /* SGI Newport console */ struct screen_info; #ifdef CONFIG_VGA_CONSOLE void vgacon_register_screen(struct screen_info *si); #else static inline void vgacon_register_screen(struct screen_info *si) { } #endif int con_is_bound(const struct consw *csw); int do_unregister_con_driver(const struct consw *csw); int do_take_over_console(const struct consw *sw, int first, int last, int deflt); void give_up_console(const struct consw *sw); #ifdef CONFIG_VT void con_debug_enter(struct vc_data *vc); void con_debug_leave(void); #else static inline void con_debug_enter(struct vc_data *vc) { } static inline void con_debug_leave(void) { } #endif /* * The interface for a console, or any other device that wants to capture * console messages (printer driver?) */ /** * enum cons_flags - General console flags * @CON_PRINTBUFFER: Used by newly registered consoles to avoid duplicate * output of messages that were already shown by boot * consoles or read by userspace via syslog() syscall. * @CON_CONSDEV: Indicates that the console driver is backing * /dev/console. * @CON_ENABLED: Indicates if a console is allowed to print records. If * false, the console also will not advance to later * records. * @CON_BOOT: Marks the console driver as early console driver which * is used during boot before the real driver becomes * available. It will be automatically unregistered * when the real console driver is registered unless * "keep_bootcon" parameter is used. * @CON_ANYTIME: A misnomed historical flag which tells the core code * that the legacy @console::write callback can be invoked * on a CPU which is marked OFFLINE. 
That is misleading as * it suggests that there is no contextual limit for * invoking the callback. The original motivation was * readiness of the per-CPU areas. * @CON_BRL: Indicates a braille device which is exempt from * receiving the printk spam for obvious reasons. * @CON_EXTENDED: The console supports the extended output format of * /dev/kmesg which requires a larger output buffer. * @CON_SUSPENDED: Indicates if a console is suspended. If true, the * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. */ enum cons_flags { CON_PRINTBUFFER = BIT(0), CON_CONSDEV = BIT(1), CON_ENABLED = BIT(2), CON_BOOT = BIT(3), CON_ANYTIME = BIT(4), CON_BRL = BIT(5), CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), }; /** * struct nbcon_state - console state for nbcon consoles * @atom: Compound of the state fields for atomic operations * * @req_prio: The priority of a handover request * @prio: The priority of the current owner * @unsafe: Console is busy in a non takeover region * @unsafe_takeover: A hostile takeover in an unsafe state happened in the * past. The console cannot be safe until re-initialized. * @cpu: The CPU on which the owner runs * * To be used for reading and preparing of the value stored in the nbcon * state variable @console::nbcon_state. * * The @prio and @req_prio fields are particularly important to allow * spin-waiting to timeout and give up without the risk of a waiter being * assigned the lock after giving up. */ struct nbcon_state { union { unsigned int atom; struct { unsigned int prio : 2; unsigned int req_prio : 2; unsigned int unsafe : 1; unsigned int unsafe_takeover : 1; unsigned int cpu : 24; }; }; }; /* * The nbcon_state struct is used to easily create and interpret values that * are stored in the @console::nbcon_state variable. Ensure this struct stays * within the size boundaries of the atomic variable's underlying type in * order to avoid any accidental truncation. */ static_assert(sizeof(struct nbcon_state) <= sizeof(int)); /** * enum nbcon_prio - console owner priority for nbcon consoles * @NBCON_PRIO_NONE: Unused * @NBCON_PRIO_NORMAL: Normal (non-emergency) usage * @NBCON_PRIO_EMERGENCY: Emergency output (WARN/OOPS...) * @NBCON_PRIO_PANIC: Panic output * @NBCON_PRIO_MAX: The number of priority levels * * A higher priority context can takeover the console when it is * in the safe state. The final attempt to flush consoles in panic() * can be allowed to do so even in an unsafe state (Hope and pray). */ enum nbcon_prio { NBCON_PRIO_NONE = 0, NBCON_PRIO_NORMAL, NBCON_PRIO_EMERGENCY, NBCON_PRIO_PANIC, NBCON_PRIO_MAX, }; struct console; struct printk_buffers; /** * struct nbcon_context - Context for console acquire/release * @console: The associated console * @spinwait_max_us: Limit for spin-wait acquire * @prio: Priority of the context * @allow_unsafe_takeover: Allow performing takeover even if unsafe. Can * be used only with NBCON_PRIO_PANIC @prio. It * might cause a system freeze when the console * is used later. 
* @backlog: Ringbuffer has pending records * @pbufs: Pointer to the text buffer for this context * @seq: The sequence number to print for this context */ struct nbcon_context { /* members set by caller */ struct console *console; unsigned int spinwait_max_us; enum nbcon_prio prio; unsigned int allow_unsafe_takeover : 1; /* members set by emit */ unsigned int backlog : 1; /* members set by acquire */ struct printk_buffers *pbufs; u64 seq; }; /** * struct nbcon_write_context - Context handed to the nbcon write callbacks * @ctxt: The core console context * @outbuf: Pointer to the text buffer for output * @len: Length to write * @unsafe_takeover: If a hostile takeover in an unsafe state has occurred */ struct nbcon_write_context { struct nbcon_context __private ctxt; char *outbuf; unsigned int len; bool unsafe_takeover; }; /** * struct console - The console descriptor structure * @name: The name of the console driver * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) * @setup: Callback for initializing the console (Optional) * @exit: Callback for teardown of the console (Optional) * @match: Callback for matching a console (Optional) * @flags: Console flags. See enum cons_flags * @index: Console index, e.g. port number * @cflag: TTY control mode flags * @ispeed: TTY input speed * @ospeed: TTY output speed * @seq: Sequence number of the next ringbuffer record to print * @dropped: Number of unreported dropped ringbuffer records * @data: Driver private data * @node: hlist node for the console list * * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer * @kthread: Printer kthread for this console * @rcuwait: RCU-safe wait object for @kthread waking * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; void (*write)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *co, char *s, unsigned int count); struct tty_driver *(*device)(struct console *co, int *index); void (*unblank)(void); int (*setup)(struct console *co, char *options); int (*exit)(struct console *co); int (*match)(struct console *co, char *name, int idx, char *options); short flags; short index; int cflag; uint ispeed; uint ospeed; u64 seq; unsigned long dropped; void *data; struct hlist_node node; /* nbcon console specific members */ /** * @write_atomic: * * NBCON callback to write out text in any context. (Optional) * * This callback is called with the console already acquired. However, * a higher priority context is allowed to take it over by default. * * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() * around any code where the takeover is not safe, for example, when * manipulating the serial port registers. * * nbcon_enter_unsafe() will fail if the context has lost the console * ownership in the meantime. In this case, the callback is no longer * allowed to go forward. It must back out immediately and carefully. * The buffer content is also no longer trusted since it no longer * belongs to the context. * * The callback should allow the takeover whenever it is safe. It * increases the chance to see messages when the system is in trouble. 
* If the driver must reacquire ownership in order to finalize or * revert hardware changes, nbcon_reacquire_nobuf() can be used. * However, on reacquire the buffer content is no longer available. A * reacquire cannot be used to resume printing. * * The callback can be called from any context (including NMI). * Therefore it must avoid usage of any locking and instead rely * on the console ownership for synchronization. */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); /** * @write_thread: * * NBCON callback to write out text in task context. * * This callback must be called only in task context with both * device_lock() and the nbcon console acquired with * NBCON_PRIO_NORMAL. * * The same rules for console ownership verification and unsafe * section handling apply as with write_atomic(). * * The console ownership handling is necessary for synchronization * against write_atomic() which is synchronized only via the context. * * The device_lock() provides the primary serialization for operations * on the device. It might be as relaxed (mutex)[*] or as tight * (disabled preemption and interrupts) as needed. It allows * the kthread to operate in the least restrictive mode[**]. * * [*] Standalone nbcon_context_try_acquire() is not safe with * preemption enabled, see nbcon_owner_matches(). But it * can be safe when always called in preemptible context * under the device_lock(). * * [**] The device_lock() makes sure that nbcon_context_try_acquire() * would never need to spin, which is important especially with * PREEMPT_RT. */ void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); /** * @device_lock: * * NBCON callback to begin synchronization with driver code. * * Console drivers typically must deal with access to the hardware * via user input/output (such as an interactive login shell) and * output of kernel messages via printk() calls. This callback is * called by the printk-subsystem whenever it needs to synchronize * with hardware access by the driver. It should be implemented to * use whatever synchronization mechanism the driver is using for * itself (for example, the port lock for uart serial consoles). * * The callback is always called from task context. It may use any * synchronization method required by the driver. * * IMPORTANT: The callback MUST disable migration. The console driver * may be using a synchronization mechanism that already takes * care of this (such as spinlocks). Otherwise this function must * explicitly call migrate_disable(). * * The flags argument is provided as a convenience to the driver. It * will be passed again to device_unlock(). It can be ignored if the * driver does not need it. */ void (*device_lock)(struct console *con, unsigned long *flags); /** * @device_unlock: * * NBCON callback to finish synchronization with driver code. * * It is the counterpart to device_lock(). * * This callback is always called from task context. It must * appropriately re-enable migration (depending on how device_lock() * disabled migration). * * The flags argument is the value of the same variable that was * passed to device_lock().
*/ void (*device_unlock)(struct console *con, unsigned long flags); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; atomic_long_t __private nbcon_prev_seq; struct printk_buffers *pbufs; struct task_struct *kthread; struct rcuwait rcuwait; struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP extern void lockdep_assert_console_list_lock_held(void); #else static inline void lockdep_assert_console_list_lock_held(void) { } #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC extern bool console_srcu_read_lock_is_held(void); #else static inline bool console_srcu_read_lock_is_held(void) { return 1; } #endif extern int console_srcu_read_lock(void); extern void console_srcu_read_unlock(int cookie); extern void console_list_lock(void) __acquires(console_mutex); extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** * console_srcu_read_flags - Locklessly read flags of a possibly registered * console * @con: struct console pointer of console to read flags from * * Locklessly reading @con->flags provides a consistent read value because * there is at most one CPU modifying @con->flags and that CPU is using only * read-modify-write operations to do so. * * Requires console_srcu_read_lock to be held, which implies that @con might * be a registered console. The purpose of holding console_srcu_read_lock is * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) * and that no exit/cleanup routines will run if the console is currently * undergoing unregistration. * * If the caller is holding the console_list_lock or it is _certain_ that * @con is not and will not become registered, the caller may read * @con->flags directly instead. * * Context: Any context. * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified * for registered consoles with console_srcu_write_flags(). */ return data_race(READ_ONCE(con->flags)); } /** * console_srcu_write_flags - Write flags for a registered console * @con: struct console pointer of console to write flags to * @flags: new flags value to write * * Only use this function to write flags for registered consoles. It * requires holding the console_list_lock. * * Context: Any context. */ static inline void console_srcu_write_flags(struct console *con, short flags) { lockdep_assert_console_list_lock_held(); /* This matches the READ_ONCE() in console_srcu_read_flags(). */ WRITE_ONCE(con->flags, flags); } /* Variant of console_is_registered() when the console_list_lock is held. */ static inline bool console_is_registered_locked(const struct console *con) { lockdep_assert_console_list_lock_held(); return !hlist_unhashed(&con->node); } /* * console_is_registered - Check if the console is registered * @con: struct console pointer of console to check * * Context: Process context. May sleep while acquiring console list lock. * Return: true if the console is in the console list, otherwise false. * * If false is returned for a console that was previously registered, it * can be assumed that the console's unregistration is fully completed, * including the exit() callback after console list removal. 
*/ static inline bool console_is_registered(const struct console *con) { bool ret; console_list_lock(); ret = console_is_registered_locked(con); console_list_unlock(); return ret; } /** * for_each_console_srcu() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * Although SRCU guarantees the console list will be consistent, the * struct console fields may be updated by other CPUs while iterating. * * Requires console_srcu_read_lock to be held. Can be invoked from * any context. */ #define for_each_console_srcu(con) \ hlist_for_each_entry_srcu(con, &console_list, node, \ console_srcu_read_lock_is_held()) /** * for_each_console() - Iterator over registered consoles * @con: struct console pointer used as loop cursor * * The console list and the &console.flags are immutable while iterating. * * Requires console_list_lock to be held. */ #define for_each_console(con) \ lockdep_assert_console_list_lock_held(); \ hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } #endif extern int console_set_on_cmdline; extern struct console *early_console; enum con_flush_mode { CONSOLE_FLUSH_PENDING, CONSOLE_REPLAY_ALL, }; extern int add_preferred_console(const char *name, const short idx, char *options); extern void console_force_preferred_locked(struct console *con); extern void register_console(struct console *); extern int unregister_console(struct console *); extern void console_lock(void); extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_suspend(struct console *); extern void console_resume(struct console *); extern int is_console_locked(void); extern int braille_register_console(struct console *, int index, char *console_options, char *braille_options); extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); #else static inline void console_sysfs_notify(void) { } #endif extern bool console_suspend_enabled; /* Suspend and resume console messages over PM events */ extern void console_suspend_all(void); extern void console_resume_all(void); int mda_console_init(void); void vcs_make_sysfs(int index); void vcs_remove_sysfs(int index); /* Some debug stub to catch some of the obvious races in the VT code */ #define WARN_CONSOLE_UNLOCKED() \ WARN_ON(!atomic_read(&ignore_console_lock_warning) && \ !is_console_locked() && !oops_in_progress) /* * Increment ignore_console_lock_warning if you need to quiet * WARN_CONSOLE_UNLOCKED() for debugging purposes. 
*/ extern atomic_t ignore_console_lock_warning; extern void console_init(void); /* For deferred console takeover */ void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); #endif /* _LINUX_CONSOLE_H */
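The @write_atomic kernel-doc above describes an ownership protocol rather than a lock: printing code must bracket any non-takeover-safe hardware access with nbcon_enter_unsafe()/nbcon_exit_unsafe() and back out if ownership is lost. Below is a minimal sketch of that protocol, assuming a hypothetical driver: the register accessor my_hw_put_char() and the use of con->data as port state are illustrative stand-ins, not part of this header.

static void my_write_atomic(struct console *con,
			    struct nbcon_write_context *wctxt)
{
	unsigned int i;

	/* Register access is not takeover-safe: enter an unsafe section. */
	if (!nbcon_enter_unsafe(wctxt))
		return;	/* Ownership was lost; back out immediately. */

	for (i = 0; i < wctxt->len; i++)
		my_hw_put_char(con->data, wctxt->outbuf[i]);	/* hypothetical accessor */

	/* Leave the unsafe section so a higher priority can take over again. */
	nbcon_exit_unsafe(wctxt);
}

If nbcon_enter_unsafe() fails, the buffer no longer belongs to this context, which is why the sketch returns without touching wctxt->outbuf.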
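The SRCU accessors above combine into a common read-side pattern: take the SRCU read lock, walk the registered consoles with for_each_console_srcu(), and read @flags only through console_srcu_read_flags(). A minimal sketch; the helper name any_console_usable() is hypothetical.

static bool any_console_usable(void)
{
	struct console *con;
	bool usable = false;
	int cookie;

	cookie = console_srcu_read_lock();
	for_each_console_srcu(con) {
		/* Lockless but consistent read of the console flags. */
		short flags = console_srcu_read_flags(con);

		if ((flags & CON_ENABLED) && !(flags & CON_SUSPENDED)) {
			usable = true;
			break;
		}
	}
	console_srcu_read_unlock(cookie);

	return usable;
}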
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Based on arch/arm/include/asm/traps.h * * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_TRAP_H #define __ASM_TRAP_H #include <linux/list.h> #include <asm/esr.h> #include <asm/ptrace.h> #include <asm/sections.h> #ifdef CONFIG_ARMV8_DEPRECATED bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn); #else static inline bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn) { return false; } #endif /* CONFIG_ARMV8_DEPRECATED */ void force_signal_inject(int signal, int code, unsigned long address, unsigned long err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey); void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); int early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs); /* * Move regs->pc to the next instruction and do necessary setup before it * is executed. */ void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size); static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } static inline int in_entry_text(unsigned long ptr) { return ptr >= (unsigned long)&__entry_text_start && ptr < (unsigned long)&__entry_text_end; } /* * CPUs with the RAS extensions have an Implementation-Defined-Syndrome bit * to indicate whether this ESR has a RAS encoding. CPUs without this feature * have an ISS-Valid bit in the same position. * If this bit is set, we know it's not a RAS SError. * If it's clear, we need to know if the CPU supports RAS. Uncategorized RAS * errors share the same encoding as an all-zeros encoding from a CPU that * doesn't support RAS. */ static inline bool arm64_is_ras_serror(unsigned long esr) { WARN_ON(preemptible()); if (esr & ESR_ELx_IDS) return false; if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) return true; else return false; } /* * Return the AET bits from a RAS SError's ESR. * * It is implementation defined whether Uncategorized errors are containable. * We treat them as Uncontainable. * Non-RAS SErrors are reported as Uncontained/Uncategorized. */ static inline unsigned long arm64_ras_serror_get_severity(unsigned long esr) { unsigned long aet = esr & ESR_ELx_AET; if (!arm64_is_ras_serror(esr)) { /* Not a RAS error, we can't interpret the ESR.
*/ return ESR_ELx_AET_UC; } /* * AET is RES0 if 'the value returned in the DFSC field is not * [ESR_ELx_FSC_SERROR]' */ if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_SERROR) { /* No severity information : Uncategorized */ return ESR_ELx_AET_UC; } return aet; } bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr); void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr); static inline void arm64_mops_reset_regs(struct user_pt_regs *regs, unsigned long esr) { bool wrong_option = esr & ESR_ELx_MOPS_ISS_WRONG_OPTION; bool option_a = esr & ESR_ELx_MOPS_ISS_OPTION_A; int dstreg = ESR_ELx_MOPS_ISS_DESTREG(esr); int srcreg = ESR_ELx_MOPS_ISS_SRCREG(esr); int sizereg = ESR_ELx_MOPS_ISS_SIZEREG(esr); unsigned long dst, size; dst = regs->regs[dstreg]; size = regs->regs[sizereg]; /* * Put the registers back in the original format suitable for a * prologue instruction, using the generic return routine from the * Arm ARM (DDI 0487I.a) rules CNTMJ and MWFQH. */ if (esr & ESR_ELx_MOPS_ISS_MEM_INST) { /* SET* instruction */ if (option_a ^ wrong_option) { /* Format is from Option A; forward set */ regs->regs[dstreg] = dst + size; regs->regs[sizereg] = -size; } } else { /* CPY* instruction */ unsigned long src = regs->regs[srcreg]; if (!(option_a ^ wrong_option)) { /* Format is from Option B */ if (regs->pstate & PSR_N_BIT) { /* Backward copy */ regs->regs[dstreg] = dst - size; regs->regs[srcreg] = src - size; } } else { /* Format is from Option A */ if (size & BIT(63)) { /* Forward copy */ regs->regs[dstreg] = dst + size; regs->regs[srcreg] = src + size; regs->regs[sizereg] = -size; } } } if (esr & ESR_ELx_MOPS_ISS_FROM_EPILOGUE) regs->pc -= 8; else regs->pc -= 4; } #endif
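A hedged sketch of how the two RAS helpers above compose in an SError path: first decide whether the ESR uses the RAS encoding, then map the severity bits, treating Uncategorized as Uncontainable. The reporting function and its messages are illustrative only, not the kernel's actual handler.

static void report_serror(unsigned long esr)
{
	/* Must run with preemption disabled; arm64_is_ras_serror() warns otherwise. */
	if (!arm64_is_ras_serror(esr)) {
		pr_crit("SError without RAS encoding, cannot interpret ESR\n");
		return;
	}

	/* Uncategorized errors carry the AET_UC encoding and are treated
	 * as Uncontainable by arm64_ras_serror_get_severity().
	 */
	if (arm64_ras_serror_get_severity(esr) == ESR_ELx_AET_UC)
		pr_crit("Uncontainable or Uncategorized RAS SError\n");
	else
		pr_crit("RAS SError, AET=%#lx\n", esr & ESR_ELx_AET);
}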
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
* Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include "fib_lookup.h" #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) #define DEFAULT_MIN_ADVMSS 256 static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. 
*/ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss 
gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* st->gc_goal_miss */ 0, /* st->gc_dst_overflow */ 0, /* st->in_hlist_search */ 0 /* st->out_hlist_search */ ); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_create_seq("rt_cache", 0444, net->proc_net, &rt_cache_seq_ops); if (!pde) goto err1; pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_ops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create_single("rt_acct", 0, net->proc_net, rt_acct_proc_show); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline bool rt_is_expired(const struct rtable *rth) { bool res; rcu_read_lock(); res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); rcu_read_unlock(); return res; } void rt_cache_flush(struct net *net) { rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; struct neighbour *n; rcu_read_lock(); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { n = ip_neigh_gw6(dev, &rt->rt_gw6); } else { __be32 pkey; pkey = skb ? 
ip_hdr(skb)->daddr : *((__be32 *) daddr); n = ip_neigh_gw4(dev, pkey); } if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; } else if (rt->rt_gw_family == AF_INET6) { return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); } else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } /* Hash tables of size 2048..262144 depending on RAM size. * Each bucket uses 8 bytes. */ static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes it hard for an attacker * to infer how many packets were sent between two points in time. */ static u32 ip_idents_reserve(u32 hash, int segs) { u32 bucket, old, now = (u32)jiffies; atomic_t *p_id; u32 *p_tstamp; u32 delta = 0; bucket = hash & ip_idents_mask; p_tstamp = ip_tstamps + bucket; p_id = ip_idents + bucket; old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = get_random_u32_below(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it; that was a bug * in UBSAN, and it has been fixed in GCC-8. */ return atomic_add_return(segs + delta, p_id) - segs; } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { u32 hash, id; /* Note the following code is not safe, but this is okay. */ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) get_random_bytes(&net->ipv4.ip_id_key, sizeof(net->ipv4.ip_id_key)); hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, int flow_flags) { __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ?
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); else build_sk_flow_key(fl4, sk); } static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { struct rtable *rt; rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } } static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception __rcu **fnhe_p, **oldest_p; struct fib_nh_exception *fnhe, *oldest = NULL; for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); if (!fnhe) break; if (!oldest || time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; oldest_p = fnhe_p; } } fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) { static siphash_aligned_key_t fnhe_hash_key; u64 hval; net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } } static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; u32 genid, hval; unsigned int i; int depth; genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; depth = 0; for (fnhe = rcu_dereference(hash->chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) break; depth++; } if (fnhe) { if (fnhe->fnhe_genid != genid) fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) fill_route_from_fnhe(rt, fnhe); rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) fill_route_from_fnhe(rt, fnhe); } else { /* Randomize max depth to avoid some side-channel attacks.
*/ int max_depth = FNHE_RECLAIM_DEPTH + get_random_u32_below(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); depth--; } fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); if (!fnhe) goto out_unlock; fnhe->fnhe_next = hash->chain; fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); rcu_assign_pointer(hash->chain, fnhe); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; for_each_possible_cpu(i) { struct rtable __rcu **prt; prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; } } fnhe->fnhe_stamp = jiffies; out_unlock: spin_unlock_bh(&fnhe_lock); } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct neighbour *n; struct net *net; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: break; default: return; } if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) { const struct iphdr *iph = (const struct iphdr *) skb->data; __be32 daddr = iph->daddr; __be32 saddr = iph->saddr; net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); } #endif ; } static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); if ((dst->obsolete > 0) || (rt->rt_flags & RTCF_REDIRECTED) || rt->dst.expires) 

/*
 * Algorithm:
 * 1. The first ip_rt_redirect_number redirects are sent with exponential
 *    backoff, then we stop sending them at all, assuming that the host
 *    ignores our redirects.
 * 2. If we did not see packets requiring redirects during
 *    ip_rt_redirect_silence, we assume that the host has forgotten the
 *    redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load
 * limiting in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
	if (!peer) {
		rcu_read_unlock();
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything.
	 * Set peer->rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_unlock;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
		if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
	}
out_unlock:
	rcu_read_unlock();
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	SKB_DR(reason);
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. 
*/ if (!in_dev) goto out; net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: SKB_DR_SET(reason, IP_INADDRERRORS); __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; } switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; } rcu_read_unlock(); if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb_reason(skb, reason); return 0; } static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; bool lock = false; struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) return; old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; rcu_read_lock(); net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res.fi) > 1) { int nhsel; for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { nhc = fib_info_nhc(res.fi, nhsel); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } out: rcu_read_unlock(); } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); /* Don't make lookup fail for bridged encapsulations */ if (skb && netif_is_any_bridge_port(skb->dev)) fl4.flowi4_oif = 0; __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } void 
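example_clamp_pmtu(u32 *mtu, u32 old_mtu, u32 min_pmtu, bool *lock);
/* ^ hypothetical prototype, present only so the sketch below can sit
 * between the two helpers without disturbing their declarations.
 *
 * __ip_rt_update_pmtu() above never lets an ICMP-reported MTU drop the
 * path MTU below the ip_rt_min_pmtu sysctl: smaller values are clamped
 * to the minimum and the exception is locked (fnhe_mtu_locked) so the
 * bogus value cannot shrink it further. A sketch of that rule:
 */
void example_clamp_pmtu(u32 *mtu, u32 old_mtu, u32 min_pmtu, bool *lock)
{
	if (*mtu < min_pmtu) {
		*lock = true;
		*mtu = min(old_mtu, min_pmtu);
	}
}

void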
ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; struct net *net = sock_net(sk); bh_lock_sock(sk); if (!ip_sk_accept_pmtu(sk)) goto out; odst = sk_dst_get(sk); if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } if (new) sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. */ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct net_device *dev; struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. * Also check we have a reasonable ipv4 header. */ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; memset(&opt, 0, sizeof(opt)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) return; } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? 
skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* * We do not cache source address of outgoing interface, * because it is used only by IP RR, TS and SRR options, * so that it out of fast path. * * BTW remember: "addr" is allowed to be not aligned * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); unsigned int advmss; struct net *net; rcu_read_lock(); net = dev_net_rcu(dst->dev); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; u32 hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. 
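 *    mtu from the egress device (capped at IP_MAX_MTU)
 */

/* A sketch of that precedence with the inputs flattened to plain
 * parameters (illustration only; the real lookup follows below):
 */
static u32 example_mtu_precedence(u32 route_mtu, u32 fnhe_pmtu,
				  bool fnhe_valid, u32 dev_mtu)
{
	u32 mtu = route_mtu;				/* 1. */

	if (!mtu && fnhe_valid)
		mtu = fnhe_pmtu;			/* 2. */
	if (!mtu)
		mtu = min(dev_mtu, IP_MAX_MTU);		/* 3. */
	return mtu;
}

/* In the common case neither 1. nor 2. applies, so we fall back to the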
mtu from egress device */ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_nh_common *nhc = res->nhc; struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; u32 mtu = 0; if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { bool ret = false; spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { struct rtable __rcu **porig; struct rtable *orig; int genid = fnhe_genid(dev_net(rt->dst.dev)); if (rt_is_input_route(rt)) porig = &fnhe->fnhe_rth_input; else porig = &fnhe->fnhe_rth_output; orig = rcu_dereference(*porig); if (fnhe->fnhe_genid != genid) { fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; fnhe->fnhe_mtu_locked = false; fnhe_flush_routes(fnhe); orig = NULL; } fill_route_from_fnhe(rt, fnhe); if (!rt->rt_gw4) { rt->rt_gw4 = daddr; rt->rt_gw_family = AF_INET; } if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); } ret = true; } fnhe->fnhe_stamp = jiffies; } spin_unlock_bh(&fnhe_lock); return ret; } static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nhc->nhc_rth_input; } else { p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; /* hold dst before doing cmpxchg() to avoid race condition * on this dst */ dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { dst_release(&rt->dst); ret = false; } return ret; } struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt_del_uncached_list(struct rtable *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void ipv4_dst_destroy(struct dst_entry *dst) { ip_dst_metrics_put(dst); rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) { struct rtable *rt, *safe; int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static bool rt_cache_valid(const struct rtable *rt) { return rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 
daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag, const bool do_cache) { bool cached = false; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) rt->rt_gw4 = nhc->nhc_gw.ipv4; else rt->rt_gw6 = nhc->nhc_gw.ipv6; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); rt->dst.tclassid = nh->nh_tclassid; } #endif rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ if (!rt->rt_gw4) { rt->rt_gw_family = AF_INET; rt->rt_gw4 = daddr; } rt_add_uncached_list(rt); } } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; } EXPORT_SYMBOL(rt_dst_alloc); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) { struct rtable *new_rt; new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, rt->dst.flags); if (new_rt) { new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); new_rt->rt_flags = rt->rt_flags; new_rt->rt_type = rt->rt_type; new_rt->rt_is_input = rt->rt_is_input; new_rt->rt_iif = rt->rt_iif; new_rt->rt_pmtu = rt->rt_pmtu; new_rt->rt_mtu_locked = rt->rt_mtu_locked; new_rt->rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; new_rt->dst.input = rt->dst.input; new_rt->dst.output = rt->dst.output; new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); } return new_rt; } EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. 
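	 * In order, the code below rejects:
	 *  - packets seen before the device has an in_device,
	 *  - multicast or limited-broadcast *source* addresses,
	 *  - frames that are not IPv4 at the link layer,
	 *  - loopback sources while route_localnet is disabled, and
	 *  - zeronet sources, which are tolerated only for IGMP and
	 *    link-local multicast (hosts that have no address yet);
	 *    every other source goes through fib_validate_source().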
*/ if (!in_dev) return SKB_DROP_REASON_NOT_SPECIFIED; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) return SKB_DROP_REASON_IP_INVALID_SOURCE; if (skb->protocol != htons(ETH_P_IP)) return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) return reason; } return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ static enum skb_drop_reason ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (reason) return reason; if (our) flags |= RTCF_LOCAL; if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_is_input= 1; #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return SKB_NOT_DROPPED_YET; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ pr_warn("martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), dev->hard_header_len, false); } } #endif } /* called in rcu_read_lock() section */ static enum skb_drop_reason __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; bool do_cache; u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { __be32 gw; gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. 
DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { reason = SKB_DROP_REASON_NOMEM; goto cleanup; } rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: reason = SKB_NOT_DROPPED_YET; cleanup: return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH /* To make ICMP packets follow the right flow, the multipath hash is * calculated from the inner IP addresses. */ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; if (likely(outer_iph->protocol != IPPROTO_ICMP)) goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; out: hash_keys->addrs.v4addrs.src = key_iph->saddr; hash_keys->addrs.v4addrs.dst = key_iph->daddr; } static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. 
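 * The caller, fib_multipath_custom_hash_skb(), then folds the two
 * values together as
 *
 *	jhash_2words(outer_hash, inner_hash, 0)
 *
 * so a half that is disabled by the sysctl field mask simply
 * contributes zero to the final hash.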
*/ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = fl4->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = fl4->fl4_sport; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? 
fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash = 0; switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); } else { hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); /* skb is currently provided only when forwarding */ if (skb) { struct flow_keys keys; skb_flow_dissect_flow_keys(skb, &keys, 0); /* Inner can be v4 or v6 */ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; hash_keys.tags.flow_label = keys.tags.flow_label; hash_keys.basic.ip_proto = keys.basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; ip_multipath_l3_keys(skb, &hash_keys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = fib_multipath_custom_hash_skb(net, skb); else mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static enum skb_drop_reason ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is 
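 * not a local broadcast; together these let the fast path skip the
 * full lookup.
 */

/* ip_route_use_hint() below serves the listified receive path: when
 * consecutive packets of a batch share their destination, the dst of
 * the previous packet (the "hint") is revalidated and copied rather
 * than resolved again. The adoption step, reduced to a sketch
 * (illustration only, not a kernel helper):
 */
static inline void example_adopt_hint(struct sk_buff *skb,
				      const struct sk_buff *hint)
{
	/* Same destination as the previous packet: reuse its route. */
	skb_dst_copy(skb, hint);
}

/* Reminder of the contract: daddr has been validated and is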
not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); u32 tag = 0; if (!in_dev) return reason; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &tag); if (reason) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); return reason; } /* get device for dst_alloc with local routes */ static struct net_device *ip_rt_get_dev(struct net *net, const struct fib_result *res) { struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; struct net_device *dev = NULL; if (nhc) dev = l3mdev_master_dev_rcu(nhc->nhc_dev); return dev ? : net->loopback_dev; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * Changes in the enforced policies must be applied also to * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. * called with rcu_read_lock() */ static enum skb_drop_reason ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; struct flowi4 fl4; bool do_cache = true; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected * by fib_lookup. */ tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } res->fi = NULL; res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. 
	 * Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	if (ipv4_is_zeronet(daddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_DEST;
		goto martian_destination;
	}

	/* The following code tries to avoid calling
	 * IN_DEV_NET_ROUTE_LOCALNET() more than once, when daddr and/or
	 * saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
			reason = SKB_DROP_REASON_IP_LOCALNET;
			goto martian_destination;
		}
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
			reason = SKB_DROP_REASON_IP_LOCALNET;
			goto martian_source;
		}
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_l3mdev = 0;
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	fl4.flowi4_multipath_hash = 0;

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* do not cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	err = -EINVAL;
	if (res->type == RTN_LOCAL) {
		reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
						    0, dev, in_dev, &itag);
		if (reason)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST) {
		reason = SKB_DROP_REASON_IP_INVALID_DEST;
		goto martian_destination;
	}

make_route:
	reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
				  flkeys);

out:
	return reason;

brd_input:
	if (skb->protocol != htons(ETH_P_IP)) {
		reason = SKB_DROP_REASON_INVALID_PROTO;
		goto out;
	}

	if (!ipv4_is_zeronet(saddr)) {
		reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
						    dev, in_dev, &itag);
		if (reason)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
		IPCB(skb)->flags |= IPSKB_NOPOLICY;

	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			reason = SKB_NOT_DROPPED_YET;
			goto out;
		}
	}

	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
			   flags | RTCF_LOCAL, res->type, false);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	reason = SKB_NOT_DROPPED_YET;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 * Do not cache 
martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif goto out; e_nobufs: reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } /* called with rcu_read_lock held */ static enum skb_drop_reason ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing * hardware multicast filters :-( As result the host on multicasting * network acquires a lot of useless route cache entries, sort of * SDR messages from all the world. Now we try to get rid of them. * Really, provided software IP multicast filter is organized * reasonably (at least, hashed), it does not result in a slowdown * comparing with route cache reject entries. * Note, that multicast routers are not affected, because * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; if (!in_dev) return reason; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); /* check l3 master if no match yet */ if (!our && netif_is_l3_slave(dev)) { struct in_device *l3_in_dev; l3_in_dev = __in_dev_get_rcu(skb->dev); if (l3_in_dev) our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, ip_hdr(skb)->protocol); } if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } return reason; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; rcu_read_lock(); reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); return reason; } EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) type = RTN_BROADCAST; else if (ipv4_is_multicast(fl4->daddr)) type = RTN_MULTICAST; else if (ipv4_is_zeronet(fl4->daddr)) return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; else do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. 
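		 * The prefixlen < 4 test below implements it: any matching
		 * route shorter than the 224.0.0.0/4 multicast prefix is a
		 * default(-ish) route, so its nexthop must not be used to
		 * gateway multicast traffic.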
*/ if (fi && res->prefixlen < 4) fi = NULL; } else if ((type == RTN_LOCAL) && (orig_oif != 0) && (orig_oif != dev_out->ifindex)) { /* For local routes that require a particular output interface * we do not want to cache the result. Caching the result * causes incorrect behaviour when there are multiple source * addresses on the interface, the end result being that if the * intended recipient is waiting on that interface for the * packet he won't receive it because it will be delivered on * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; RT_CACHE_STAT_INC(out_slow_tot); if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } } #endif } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); lwtunnel_set_redirect(&rth->dst); return rth; } /* * Major route resolver routine. */ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, .table = NULL, .tclassid = 0, }; struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); rcu_read_unlock(); return rth; } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; int err; if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr) || ipv4_is_zeronet(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. * It was wrong for two reasons: * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr * is assigned to multiple interfaces. * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (!dev_out) goto out; /* Special hack: user can direct multicasts * and limited broadcast via necessary interface * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. * This hack is not just for fun, it allows * vic,vat and friends to work. * They bind socket to loopback, set ttl to zero * and expect that it will work. 
* From the viewpoint of routing cache they are broken, * because we are not allowed to build multicast path * with loopback source addr (look, routing cache * cannot know, that ttl is zero, so that packet * will not leave this host and route is valid). * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr) || fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * * WHY? DW. * Because we are allowed to send to iface * even if it has NO routes and NO assigned * addresses. When oif is specified, routing * tables are looked up with only one purpose: * to catch if destination is gatewayed, rather than * direct. Moreover, if MSG_DONTROUTE is set, * we send packet, ignoring both routing tables * and ifaddr state. --ANK * * * We could make it even if oif is unknown, * likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } if (res->type == RTN_LOCAL) { if (!fl4->saddr) { if (res->fi->fib_prefsrc) fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? 
: net->loopback_dev; /* make sure orig_oif points to fib result device even * though packet rx/tx happens over loopback or l3mdev */ orig_oif = FIB_RES_OIF(*res); fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: return rth; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .default_advmss = ipv4_default_advmss, .neigh_lookup = ipv4_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; new->dev = net->loopback_dev; netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0)); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, dscp_t dscp, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; u32 error; u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? 
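		/* rtm_table is only 8 bits wide; the full id follows in RTA_TABLE */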
table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; if (rt->dst.lwtstate && lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway) { if (rt->rt_gw_family == AF_INET && nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; } else if (rt->rt_gw_family == AF_INET6) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &rt->rt_gw6, alen); } } expires = rt->dst.expires; if (expires) { unsigned long now = jiffies; if (time_before(now, expires)) expires -= now; else expires = 0; } memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rt->rt_mtu_locked && expires) metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (fl4) { if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && nla_put_u32(skb, RTA_UID, from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) goto nla_put_failure; if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); if (err <= 0) { if (err == 0) return 0; goto nla_put_failure; } } else #endif if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } } error = rt->dst.error; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, int *fa_index, int fa_start, unsigned int flags) { int i; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference(bucket[i].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { struct rtable *rt; int err; if (*fa_index < fa_start) goto next; if (fnhe->fnhe_genid != genid) goto next; if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) goto next; rt = rcu_dereference(fnhe->fnhe_rth_input); if (!rt) rt = rcu_dereference(fnhe->fnhe_rth_output); if (!rt) goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) return err; next: (*fa_index)++; } } return 0; } int 
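example_count_fnhes(struct fnhe_hash_bucket *bucket);
/* ^ hypothetical prototype, present only so the sketch below can sit
 * here without disturbing fib_dump_info_fnhe()'s declaration.
 *
 * fnhe_dump_bucket() above walks every bucket chain under RCU, skipping
 * stale entries. The traversal idiom, reduced to its core (illustration
 * only):
 */
int example_count_fnhes(struct fnhe_hash_bucket *bucket)
{
	struct fib_nh_exception *fnhe;
	int n = 0;

	/* caller must hold rcu_read_lock() */
	for (fnhe = rcu_dereference(bucket->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next))
		n++;
	return n;
}

int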
fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); struct fnhe_hash_bucket *bucket; int err; if (nhc->nhc_flags & RTNH_F_DEAD) continue; rcu_read_lock(); bucket = rcu_dereference(nhc->nhc_exceptions); err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid, fa_index, fa_start, flags); rcu_read_unlock(); if (err) return err; } return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) { struct sk_buff *skb; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; /* Reserve room for dummy headers, this skb can pass * through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = ip_proto; iph->saddr = src; iph->daddr = dst; iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; skb_set_transport_header(skb, skb->len); switch (iph->protocol) { case IPPROTO_UDP: { struct udphdr *udph; udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } case IPPROTO_TCP: { struct tcphdr *tcph; tcph = skb_put_zero(skb, sizeof(struct tcphdr)); tcph->source = sport; tcph->dest = dport; tcph->doff = sizeof(struct tcphdr) / 4; tcph->rst = 1; tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), src, dst, 0); break; } case IPPROTO_ICMP: { struct icmphdr *icmph; icmph = skb_put_zero(skb, sizeof(struct icmphdr)); icmph->type = ICMP_ECHO; icmph->code = 0; } } return skb; } static int inet_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_NOTIFY | RTM_F_LOOKUP_TABLE | RTM_F_FIB_MATCH)) { NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_IIF: case RTA_OIF: case RTA_SRC: case RTA_DST: case RTA_IP_PROTO: case RTA_SPORT: case RTA_DPORT: case RTA_MARK: case RTA_UID: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); return -EINVAL; } } return 0; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { 
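	/* RTM_GETROUTE handler: validate the request, build a throwaway
	 * skb carrying dummy IP/L4 headers, push it through the input or
	 * output lookup (depending on whether RTA_IIF was given), then
	 * serialize the resulting route back to user space.
	 */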
struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; u32 table_id = RT_TABLE_MAIN; __be16 sport = 0, dport = 0; struct fib_result res = {}; u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; dscp_t dscp; kuid_t uid; u32 iif; int err; int mark; err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); src = nla_get_in_addr_default(tb[RTA_SRC], 0); dst = nla_get_in_addr_default(tb[RTA_DST], 0); iif = nla_get_u32_default(tb[RTA_IIF], 0); mark = nla_get_u32_default(tb[RTA_MARK], 0); dscp = inet_dsfield_to_dscp(rtm->rtm_tos); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else uid = (iif ? INVALID_UID : current_uid()); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &ip_proto, AF_INET, extack); if (err) return err; } if (tb[RTA_SPORT]) sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) dport = nla_get_be16(tb[RTA_DPORT]); skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); if (!skb) return -ENOBUFS; fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) fl4.fl4_sport = sport; if (dport) fl4.fl4_dport = dport; fl4.flowi4_proto = ip_proto; rcu_read_lock(); if (iif) { struct net_device *dev; dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_rcu; } fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, dscp, dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); else skb_dst_set(skb, &rt->dst); } if (err) goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? 
res.table->tb_id : 0; /* reset skb for netlink reply msg */ skb_trim(skb, 0); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { struct fib_rt_info fri; if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } fri.fi = res.fi; fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { u8 slen = 32 - fri.dst_len; if (fa->fa_slen == slen && fa->tb_id == fri.tb_id && fa->fa_dscp == fri.dscp && fa->fa_info == res.fi && fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); break; } } } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: return err; errout_rcu: rcu_read_unlock(); kfree_skb(skb); goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; if (write) { rt_cache_flush(net); fnhe_genid_bump(net); return 0; } return -EINVAL; } static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. 
Use gc_min_interval_ms */ .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static const char ipv4_route_flush_procname[] = "flush"; static struct ctl_table ipv4_route_netns_table[] = { { .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { .procname = "min_pmtu", .data = &init_net.ipv4.ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &ip_min_valid_pmtu, }, { .procname = "mtu_expires", .data = &init_net.ipv4.ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv4.ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; size_t table_size = ARRAY_SIZE(ipv4_route_netns_table); tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL); if (!tbl) goto err_dup; /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; } /* Update the variables to point into the current struct net * except for the first element flush */ for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route", tbl, table_size); if (!net->ipv4.route_hdr) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_netns_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_netns_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int netns_ip_rt_init(struct net *net) { /* Set default value for namespaceified sysctls */ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU; net->ipv4.ip_rt_mtu_expires = 
DEFAULT_MTU_EXPIRES; net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS; return 0; } static struct pernet_operations __net_initdata ip_rt_ops = { .init = netns_ip_rt_init, }; static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; static int __net_init ipv4_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv4.peers = bp; return 0; } static void __net_exit ipv4_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv4.peers; net->ipv4.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip_rt_init(void) { void *idents_hash; int cpu; /* For modern hosts, this will use 2 MB of memory */ idents_hash = alloc_large_system_hash("IP idents", sizeof(*ip_idents) + sizeof(*ip_tstamps), 0, 16, /* one bucket per 64 KB */ HASH_ZERO, NULL, &ip_idents_mask, 2048, 256*1024); ip_idents = idents_hash; get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable, SLAB_HWCACHE_ALIGN | SLAB_PANIC); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); ipv4_dst_ops.gc_thresh = ~0; ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&ip_rt_ops); register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); return 0; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif
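What follows is a minimal userspace sketch (not part of the kernel sources above) showing how the RTM_GETROUTE handler is driven from the other side of the netlink socket; the destination address and buffer sizes are arbitrary choices for illustration, and error handling is abbreviated.

#include <arpa/inet.h>
#include <linux/rtnetlink.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	char buf[4096];
	ssize_t n;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rtm.rtm_dst_len = 32;	/* the strict validator wants 32 with RTA_DST */

	/* append RTA_DST: which destination should be resolved? */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(4);
	inet_pton(AF_INET, "8.8.8.8", RTA_DATA(rta));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_LENGTH(4);

	sendto(fd, &req, req.nlh.nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
	n = recv(fd, buf, sizeof(buf), 0);	/* RTM_NEWROUTE reply or nlmsgerr */
	printf("kernel answered with %zd bytes\n", n);
	close(fd);
	return 0;
}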
/* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __DSA_USER_H #define __DSA_USER_H #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/list.h> #include <linux/netpoll.h> #include <linux/types.h> #include <net/dsa.h> #include <net/gro_cells.h> struct net_device; struct netlink_ext_ack; extern struct notifier_block dsa_user_switchdev_notifier; extern struct notifier_block dsa_user_switchdev_blocking_notifier; struct dsa_user_priv { /* Copy of CPU port xmit for faster access in user transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); struct gro_cells gcells; /* DSA port data, such as switch, port index, etc. */ struct dsa_port *dp; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif /* TC context */ struct list_head mall_tc_list; }; void dsa_user_mii_bus_init(struct dsa_switch *ds); int dsa_user_create(struct dsa_port *dp); void dsa_user_destroy(struct net_device *user_dev); int dsa_user_suspend(struct net_device *user_dev); int dsa_user_resume(struct net_device *user_dev); int dsa_user_register_notifier(void); void dsa_user_unregister_notifier(void); int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr); void dsa_user_host_uc_uninstall(struct net_device *dev); void dsa_user_sync_ha(struct net_device *dev); void dsa_user_unsync_ha(struct net_device *dev); void dsa_user_setup_tagger(struct net_device *user); int dsa_user_change_mtu(struct net_device *dev, int new_mtu); int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, struct netlink_ext_ack *extack); int dsa_user_manage_vlan_filtering(struct net_device *dev, bool vlan_filtering); static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); return p->dp; } static inline struct net_device * dsa_user_to_conduit(const struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); return dsa_port_to_conduit(dp); } #endif
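A tiny userspace illustration of the hot-path trick documented in dsa_user_priv above: cache a function pointer once at setup time so the per-packet transmit path avoids re-resolving it through a longer dereference chain. All names here are made up for the sketch.

#include <stdio.h>

struct frame { int len; };

/* stands in for a tagging protocol's xmit hook */
static int tag_xmit(struct frame *f) { return f->len + 4; }

struct port_priv {
	int (*xmit)(struct frame *f);	/* cached at setup, called per packet */
};

int main(void)
{
	struct port_priv p = { .xmit = tag_xmit };
	struct frame f = { .len = 60 };

	printf("wire length: %d\n", p.xmit(&f));	/* 64 */
	return 0;
}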
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/irq_work.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/sched.h> #include <linux/tick.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/smp.h> #include <linux/smpboot.h> #include <asm/processor.h> #include <linux/kasan.h> #include <trace/events/ipi.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); static DEFINE_PER_CPU(struct task_struct *, irq_workd); static void wake_irq_workd(void) { struct task_struct *tsk = __this_cpu_read(irq_workd); if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) wake_up_process(tsk); } #ifdef CONFIG_SMP static void irq_work_wake(struct irq_work *entry) { wake_irq_workd(); } static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = IRQ_WORK_INIT_HARD(irq_work_wake); #endif static int irq_workd_should_run(unsigned int cpu) { return !llist_empty(this_cpu_ptr(&lazy_list)); } /* * Claim the entry so that no one else will poke at it. */ static bool irq_work_claim(struct irq_work *work) { int oflags; oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible.
*/ if (oflags & IRQ_WORK_PENDING) return false; return true; } void __weak arch_irq_work_raise(void) { /* * Lame architectures will get the timer tick callback */ } static __always_inline void irq_work_raise(struct irq_work *work) { if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); arch_irq_work_raise(); } /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { struct llist_head *list; bool rt_lazy_work = false; bool lazy_work = false; int work_flags; work_flags = atomic_read(&work->node.a_flags); if (work_flags & IRQ_WORK_LAZY) lazy_work = true; else if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(work_flags & IRQ_WORK_HARD_IRQ)) rt_lazy_work = true; if (lazy_work || rt_lazy_work) list = this_cpu_ptr(&lazy_list); else list = this_cpu_ptr(&raised_list); if (!llist_add(&work->node.llist, list)) return; /* If the work is "lazy", handle it from next tick if any */ if (!lazy_work || tick_nohz_tick_stopped()) irq_work_raise(work); } /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); preempt_enable(); return true; } EXPORT_SYMBOL_GPL(irq_work_queue); /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ bool irq_work_queue_on(struct irq_work *work, int cpu) { #ifndef CONFIG_SMP return irq_work_queue(work); #else /* CONFIG_SMP: */ /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); /* * On PREEMPT_RT the items which are not marked as * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work * item is used on the remote CPU to wake the thread. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) goto out; work = &per_cpu(irq_work_wakeup, cpu); if (!irq_work_claim(work)) goto out; } __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); if (llist_empty(raised) || arch_irq_work_has_interrupt()) if (llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); return true; } void irq_work_single(void *arg) { struct irq_work *work = arg; int flags; /* * Clear the PENDING bit, after this point the @work can be re-used. * The PENDING bit acts as a lock, and we own it, so we can clear it * without atomic ops. */ flags = atomic_read(&work->node.a_flags); flags &= ~IRQ_WORK_PENDING; atomic_set(&work->node.a_flags, flags); /* * See irq_work_claim(). */ smp_mb(); lockdep_irq_work_enter(flags); work->func(work); lockdep_irq_work_exit(flags); /* * Clear the BUSY bit, if set, and return to the free state if no-one * else claimed it meanwhile. 
*/ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; /* * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed * in a per-CPU thread in preemptible context. Only the items which are * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. */ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } /* * hotplug calls this through: * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); might_sleep(); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { irq_work_run_list(this_cpu_ptr(&lazy_list)); } static void irq_workd_setup(unsigned int cpu) { sched_set_fifo_low(current); } static struct smp_hotplug_thread irqwork_threads = { .store = &irq_workd, .setup = irq_workd_setup, .thread_should_run = irq_workd_should_run, .thread_fn = run_irq_workd, .thread_comm = "irq_work/%u", }; static __init int irq_work_init_threads(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); return 0; } early_initcall(irq_work_init_threads);
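The claim protocol above hinges on atomic_fetch_or() returning the old flag word: only the caller that observed PENDING clear goes on to enqueue the item. A self-contained C11 sketch of that idea (userspace atomics and simplified flag values, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WORK_PENDING 1U
#define WORK_BUSY    2U

struct work { atomic_uint flags; };

static bool work_claim(struct work *w)
{
	/* set both bits; the old value tells us who won the race */
	unsigned int oflags = atomic_fetch_or(&w->flags, WORK_PENDING | WORK_BUSY);

	return !(oflags & WORK_PENDING);	/* already pending: someone else queued it */
}

int main(void)
{
	struct work w;

	atomic_init(&w.flags, 0);
	printf("first claim:  %d\n", work_claim(&w));	/* 1: we own the enqueue */
	printf("second claim: %d\n", work_claim(&w));	/* 0: already pending */
	return 0;
}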
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
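These predicates are all the same mask-and-compare idiom in network byte order. A standalone userspace check (reimplementing two of them rather than using the kernel header) demonstrates it:

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* same idiom as ipv4_is_loopback(): mask the prefix, compare big-endian */
static bool is_loopback(uint32_t be_addr)
{
	return (be_addr & htonl(0xff000000)) == htonl(0x7f000000);
}

/* same idiom as ipv4_is_private_10() */
static bool is_private_10(uint32_t be_addr)
{
	return (be_addr & htonl(0xff000000)) == htonl(0x0a000000);
}

int main(void)
{
	uint32_t a;

	inet_pton(AF_INET, "127.0.0.1", &a);
	printf("127.0.0.1 loopback: %d\n", is_loopback(a));	/* 1 */
	inet_pton(AF_INET, "10.1.2.3", &a);
	printf("10.1.2.3 rfc1918:   %d\n", is_private_10(a));	/* 1 */
	return 0;
}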
/* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_HANDLE_EXIT_ARM64_KVM_H #include <linux/tracepoint.h> #include "sys_regs.h" #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm TRACE_EVENT(kvm_wfx_arm64, TP_PROTO(unsigned long vcpu_pc, bool is_wfe), TP_ARGS(vcpu_pc, is_wfe), TP_STRUCT__entry( __field(unsigned long, vcpu_pc) __field(bool, is_wfe) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; __entry->is_wfe = is_wfe; ), TP_printk("guest executed wf%c at: 0x%016lx", __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) ); TRACE_EVENT(kvm_hvc_arm64, TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), TP_ARGS(vcpu_pc, r0, imm), TP_STRUCT__entry( __field(unsigned long, vcpu_pc) __field(unsigned long, r0) __field(unsigned long, imm) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; __entry->r0 = r0; __entry->imm = imm; ), TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)", __entry->vcpu_pc, __entry->r0, __entry->imm) ); /* * The dreg32 name is a leftover from a distant past. This will really * output a 64bit value... */ TRACE_EVENT(kvm_arm_set_dreg32, TP_PROTO(const char *name, __u64 value), TP_ARGS(name, value), TP_STRUCT__entry( __field(const char *, name) __field(__u64, value) ), TP_fast_assign( __entry->name = name; __entry->value = value; ), TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); TRACE_EVENT(kvm_handle_sys_reg, TP_PROTO(unsigned long hsr), TP_ARGS(hsr), TP_STRUCT__entry( __field(unsigned long, hsr) ), TP_fast_assign( __entry->hsr = hsr; ), TP_printk("HSR 0x%08lx", __entry->hsr) ); TRACE_EVENT(kvm_sys_access, TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg), TP_ARGS(vcpu_pc, params, reg), TP_STRUCT__entry( __field(unsigned long, vcpu_pc) __field(bool, is_write) __field(const char *, name) __field(u8, Op0) __field(u8, Op1) __field(u8, CRn) __field(u8, CRm) __field(u8, Op2) ), TP_fast_assign( __entry->vcpu_pc = vcpu_pc; __entry->is_write = params->is_write; __entry->name = reg->name; __entry->Op0 = reg->Op0; __entry->Op1 = reg->Op1; __entry->CRn = reg->CRn; __entry->CRm = reg->CRm; __entry->Op2 = reg->Op2; ), TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s", __entry->vcpu_pc, __entry->name ?: "UNKN", __entry->Op0, __entry->Op1, __entry->CRn, __entry->CRm, __entry->Op2, __entry->is_write ? "write" : "read") ); TRACE_EVENT(kvm_set_guest_debug, TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), TP_ARGS(vcpu, guest_debug), TP_STRUCT__entry( __field(struct kvm_vcpu *, vcpu) __field(__u32, guest_debug) ), TP_fast_assign( __entry->vcpu = vcpu; __entry->guest_debug = guest_debug; ), TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) ); #endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace_handle_exit /* This part must be outside protection */ #include <trace/define_trace.h>
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2013 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #ifndef __ASM_ESR_H #define __ASM_ESR_H #include <asm/memory.h> #include <asm/sysreg.h> #define ESR_ELx_EC_UNKNOWN UL(0x00) #define ESR_ELx_EC_WFx UL(0x01) /* Unallocated EC: 0x02 */ #define ESR_ELx_EC_CP15_32 UL(0x03) #define ESR_ELx_EC_CP15_64 UL(0x04) #define ESR_ELx_EC_CP14_MR UL(0x05) #define ESR_ELx_EC_CP14_LS UL(0x06) #define ESR_ELx_EC_FP_ASIMD UL(0x07) #define ESR_ELx_EC_CP10_ID UL(0x08) /* EL2 only */ #define ESR_ELx_EC_PAC UL(0x09) /* EL2 and above */ #define ESR_ELx_EC_OTHER UL(0x0A) /* Unallocated EC: 0x0B */ #define ESR_ELx_EC_CP14_64 UL(0x0C) #define ESR_ELx_EC_BTI UL(0x0D) #define ESR_ELx_EC_ILL UL(0x0E) /* Unallocated EC: 0x0F - 0x10 */ #define ESR_ELx_EC_SVC32 UL(0x11) #define ESR_ELx_EC_HVC32 UL(0x12) /* EL2 only */ #define ESR_ELx_EC_SMC32 UL(0x13) /* EL2 and above */ /* Unallocated EC: 0x14 */ #define ESR_ELx_EC_SVC64 UL(0x15) #define ESR_ELx_EC_HVC64 UL(0x16) /* EL2 and above */ #define ESR_ELx_EC_SMC64 UL(0x17) /* EL2 and above */ #define ESR_ELx_EC_SYS64 UL(0x18) #define ESR_ELx_EC_SVE UL(0x19) #define ESR_ELx_EC_ERET UL(0x1a) /* EL2 only */ /* Unallocated EC: 0x1B */ #define ESR_ELx_EC_FPAC UL(0x1C) /* EL1 and above */ #define ESR_ELx_EC_SME UL(0x1D) /* Unallocated EC: 0x1E */ #define ESR_ELx_EC_IMP_DEF UL(0x1f) /* EL3 only */ #define ESR_ELx_EC_IABT_LOW UL(0x20) #define ESR_ELx_EC_IABT_CUR UL(0x21) #define ESR_ELx_EC_PC_ALIGN
UL(0x22) /* Unallocated EC: 0x23 */ #define ESR_ELx_EC_DABT_LOW UL(0x24) #define ESR_ELx_EC_DABT_CUR UL(0x25) #define ESR_ELx_EC_SP_ALIGN UL(0x26) #define ESR_ELx_EC_MOPS UL(0x27) #define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64 UL(0x2C) #define ESR_ELx_EC_GCS UL(0x2D) /* Unallocated EC: 0x2E */ #define ESR_ELx_EC_SERROR UL(0x2F) #define ESR_ELx_EC_BREAKPT_LOW UL(0x30) #define ESR_ELx_EC_BREAKPT_CUR UL(0x31) #define ESR_ELx_EC_SOFTSTP_LOW UL(0x32) #define ESR_ELx_EC_SOFTSTP_CUR UL(0x33) #define ESR_ELx_EC_WATCHPT_LOW UL(0x34) #define ESR_ELx_EC_WATCHPT_CUR UL(0x35) /* Unallocated EC: 0x36 - 0x37 */ #define ESR_ELx_EC_BKPT32 UL(0x38) /* Unallocated EC: 0x39 */ #define ESR_ELx_EC_VECTOR32 UL(0x3A) /* EL2 only */ /* Unallocated EC: 0x3B */ #define ESR_ELx_EC_BRK64 UL(0x3C) /* Unallocated EC: 0x3D - 0x3F */ #define ESR_ELx_EC_MAX UL(0x3F) #define ESR_ELx_EC_SHIFT (26) #define ESR_ELx_EC_WIDTH (6) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) #define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) #define ESR_ELx_IL_SHIFT (25) #define ESR_ELx_IL (UL(1) << ESR_ELx_IL_SHIFT) #define ESR_ELx_ISS_MASK (GENMASK(24, 0)) #define ESR_ELx_ISS(esr) ((esr) & ESR_ELx_ISS_MASK) #define ESR_ELx_ISS2_SHIFT (32) #define ESR_ELx_ISS2_MASK (GENMASK_ULL(55, 32)) #define ESR_ELx_ISS2(esr) (((esr) & ESR_ELx_ISS2_MASK) >> ESR_ELx_ISS2_SHIFT) /* ISS field definitions shared by different classes */ #define ESR_ELx_WNR_SHIFT (6) #define ESR_ELx_WNR (UL(1) << ESR_ELx_WNR_SHIFT) /* Asynchronous Error Type */ #define ESR_ELx_IDS_SHIFT (24) #define ESR_ELx_IDS (UL(1) << ESR_ELx_IDS_SHIFT) #define ESR_ELx_AET_SHIFT (10) #define ESR_ELx_AET (UL(0x7) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UC (UL(0) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UEU (UL(1) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UEO (UL(2) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_UER (UL(3) << ESR_ELx_AET_SHIFT) #define ESR_ELx_AET_CE (UL(6) << ESR_ELx_AET_SHIFT) /* Shared ISS field definitions for Data/Instruction aborts */ #define ESR_ELx_VNCR_SHIFT (13) #define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT) #define ESR_ELx_SET_SHIFT (11) #define ESR_ELx_SET_MASK (UL(3) << ESR_ELx_SET_SHIFT) #define ESR_ELx_FnV_SHIFT (10) #define ESR_ELx_FnV (UL(1) << ESR_ELx_FnV_SHIFT) #define ESR_ELx_EA_SHIFT (9) #define ESR_ELx_EA (UL(1) << ESR_ELx_EA_SHIFT) #define ESR_ELx_S1PTW_SHIFT (7) #define ESR_ELx_S1PTW (UL(1) << ESR_ELx_S1PTW_SHIFT) /* Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts */ #define ESR_ELx_FSC (0x3F) #define ESR_ELx_FSC_TYPE (0x3C) #define ESR_ELx_FSC_LEVEL (0x03) #define ESR_ELx_FSC_EXTABT (0x10) #define ESR_ELx_FSC_MTE (0x11) #define ESR_ELx_FSC_SERROR (0x11) #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) #define ESR_ELx_FSC_SEA_TTW(n) (0x14 + (n)) #define ESR_ELx_FSC_SECC (0x18) #define ESR_ELx_FSC_SECC_TTW(n) (0x1c + (n)) #define ESR_ELx_FSC_ADDRSZ (0x00) /* * Annoyingly, the negative levels for Address size faults aren't laid out * contiguously (or in the desired order) */ #define ESR_ELx_FSC_ADDRSZ_nL(n) ((n) == -1 ? 0x25 : 0x2C) #define ESR_ELx_FSC_ADDRSZ_L(n) ((n) < 0 ? ESR_ELx_FSC_ADDRSZ_nL(n) : \ (ESR_ELx_FSC_ADDRSZ + (n))) /* Status codes for individual page table levels */ #define ESR_ELx_FSC_ACCESS_L(n) (ESR_ELx_FSC_ACCESS + (n)) #define ESR_ELx_FSC_PERM_L(n) (ESR_ELx_FSC_PERM + (n)) #define ESR_ELx_FSC_FAULT_nL (0x2C) #define ESR_ELx_FSC_FAULT_L(n) (((n) < 0 ? 
ESR_ELx_FSC_FAULT_nL : \ ESR_ELx_FSC_FAULT) + (n)) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) #define ESR_ELx_ISV (UL(1) << ESR_ELx_ISV_SHIFT) #define ESR_ELx_SAS_SHIFT (22) #define ESR_ELx_SAS (UL(3) << ESR_ELx_SAS_SHIFT) #define ESR_ELx_SSE_SHIFT (21) #define ESR_ELx_SSE (UL(1) << ESR_ELx_SSE_SHIFT) #define ESR_ELx_SRT_SHIFT (16) #define ESR_ELx_SRT_MASK (UL(0x1F) << ESR_ELx_SRT_SHIFT) #define ESR_ELx_SF_SHIFT (15) #define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) #define ESR_ELx_AR_SHIFT (14) #define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) #define ESR_ELx_CM_SHIFT (8) #define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) /* ISS2 field definitions for Data Aborts */ #define ESR_ELx_TnD_SHIFT (10) #define ESR_ELx_TnD (UL(1) << ESR_ELx_TnD_SHIFT) #define ESR_ELx_TagAccess_SHIFT (9) #define ESR_ELx_TagAccess (UL(1) << ESR_ELx_TagAccess_SHIFT) #define ESR_ELx_GCS_SHIFT (8) #define ESR_ELx_GCS (UL(1) << ESR_ELx_GCS_SHIFT) #define ESR_ELx_Overlay_SHIFT (6) #define ESR_ELx_Overlay (UL(1) << ESR_ELx_Overlay_SHIFT) #define ESR_ELx_DirtyBit_SHIFT (5) #define ESR_ELx_DirtyBit (UL(1) << ESR_ELx_DirtyBit_SHIFT) #define ESR_ELx_Xs_SHIFT (0) #define ESR_ELx_Xs_MASK (GENMASK_ULL(4, 0)) /* ISS field definitions for exceptions taken in to Hyp */ #define ESR_ELx_CV (UL(1) << 24) #define ESR_ELx_COND_SHIFT (20) #define ESR_ELx_COND_MASK (UL(0xF) << ESR_ELx_COND_SHIFT) #define ESR_ELx_WFx_ISS_RN (UL(0x1F) << 5) #define ESR_ELx_WFx_ISS_RV (UL(1) << 2) #define ESR_ELx_WFx_ISS_TI (UL(3) << 0) #define ESR_ELx_WFx_ISS_WFxT (UL(2) << 0) #define ESR_ELx_WFx_ISS_WFI (UL(0) << 0) #define ESR_ELx_WFx_ISS_WFE (UL(1) << 0) #define ESR_ELx_xVC_IMM_MASK ((UL(1) << 16) - 1) /* ISS definitions for LD64B/ST64B/{T,P}SBCSYNC instructions */ #define ESR_ELx_ISS_OTHER_ST64BV (0) #define ESR_ELx_ISS_OTHER_ST64BV0 (1) #define ESR_ELx_ISS_OTHER_LDST64B (2) #define ESR_ELx_ISS_OTHER_TSBCSYNC (3) #define ESR_ELx_ISS_OTHER_PSBCSYNC (4) #define DISR_EL1_IDS (UL(1) << 24) /* * DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean * different things in the future... 
*/ #define DISR_EL1_ESR_MASK (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC) /* ESR value templates for specific events */ #define ESR_ELx_WFx_MASK (ESR_ELx_EC_MASK | \ (ESR_ELx_WFx_ISS_TI & ~ESR_ELx_WFx_ISS_WFxT)) #define ESR_ELx_WFx_WFI_VAL ((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) | \ ESR_ELx_WFx_ISS_WFI) /* BRK instruction trap from AArch64 state */ #define ESR_ELx_BRK64_ISS_COMMENT_MASK 0xffff /* ISS field definitions for System instruction traps */ #define ESR_ELx_SYS64_ISS_RES0_SHIFT 22 #define ESR_ELx_SYS64_ISS_RES0_MASK (UL(0x7) << ESR_ELx_SYS64_ISS_RES0_SHIFT) #define ESR_ELx_SYS64_ISS_DIR_MASK 0x1 #define ESR_ELx_SYS64_ISS_DIR_READ 0x1 #define ESR_ELx_SYS64_ISS_DIR_WRITE 0x0 #define ESR_ELx_SYS64_ISS_RT_SHIFT 5 #define ESR_ELx_SYS64_ISS_RT_MASK (UL(0x1f) << ESR_ELx_SYS64_ISS_RT_SHIFT) #define ESR_ELx_SYS64_ISS_CRM_SHIFT 1 #define ESR_ELx_SYS64_ISS_CRM_MASK (UL(0xf) << ESR_ELx_SYS64_ISS_CRM_SHIFT) #define ESR_ELx_SYS64_ISS_CRN_SHIFT 10 #define ESR_ELx_SYS64_ISS_CRN_MASK (UL(0xf) << ESR_ELx_SYS64_ISS_CRN_SHIFT) #define ESR_ELx_SYS64_ISS_OP1_SHIFT 14 #define ESR_ELx_SYS64_ISS_OP1_MASK (UL(0x7) << ESR_ELx_SYS64_ISS_OP1_SHIFT) #define ESR_ELx_SYS64_ISS_OP2_SHIFT 17 #define ESR_ELx_SYS64_ISS_OP2_MASK (UL(0x7) << ESR_ELx_SYS64_ISS_OP2_SHIFT) #define ESR_ELx_SYS64_ISS_OP0_SHIFT 20 #define ESR_ELx_SYS64_ISS_OP0_MASK (UL(0x3) << ESR_ELx_SYS64_ISS_OP0_SHIFT) #define ESR_ELx_SYS64_ISS_SYS_MASK (ESR_ELx_SYS64_ISS_OP0_MASK | \ ESR_ELx_SYS64_ISS_OP1_MASK | \ ESR_ELx_SYS64_ISS_OP2_MASK | \ ESR_ELx_SYS64_ISS_CRN_MASK | \ ESR_ELx_SYS64_ISS_CRM_MASK) #define ESR_ELx_SYS64_ISS_SYS_VAL(op0, op1, op2, crn, crm) \ (((op0) << ESR_ELx_SYS64_ISS_OP0_SHIFT) | \ ((op1) << ESR_ELx_SYS64_ISS_OP1_SHIFT) | \ ((op2) << ESR_ELx_SYS64_ISS_OP2_SHIFT) | \ ((crn) << ESR_ELx_SYS64_ISS_CRN_SHIFT) | \ ((crm) << ESR_ELx_SYS64_ISS_CRM_SHIFT)) #define ESR_ELx_SYS64_ISS_SYS_OP_MASK (ESR_ELx_SYS64_ISS_SYS_MASK | \ ESR_ELx_SYS64_ISS_DIR_MASK) #define ESR_ELx_SYS64_ISS_RT(esr) \ (((esr) & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT) /* * User space cache operations have the following sysreg encoding * in System instructions. * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 13, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 #define ESR_ELx_SYS64_ISS_CRM_DC_CVADP 13 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 #define ESR_ELx_SYS64_ISS_CRM_IC_IVAU 5 #define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK (ESR_ELx_SYS64_ISS_OP0_MASK | \ ESR_ELx_SYS64_ISS_OP1_MASK | \ ESR_ELx_SYS64_ISS_OP2_MASK | \ ESR_ELx_SYS64_ISS_CRN_MASK | \ ESR_ELx_SYS64_ISS_DIR_MASK) #define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL \ (ESR_ELx_SYS64_ISS_SYS_VAL(1, 3, 1, 7, 0) | \ ESR_ELx_SYS64_ISS_DIR_WRITE) /* * User space MRS operations which are supported for emulation * have the following sysreg encoding in System instructions. 
* op0 = 3, op1= 0, crn = 0, {crm = 0, 4-7}, READ (L = 1) */ #define ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK (ESR_ELx_SYS64_ISS_OP0_MASK | \ ESR_ELx_SYS64_ISS_OP1_MASK | \ ESR_ELx_SYS64_ISS_CRN_MASK | \ ESR_ELx_SYS64_ISS_DIR_MASK) #define ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL \ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 0, 0, 0, 0) | \ ESR_ELx_SYS64_ISS_DIR_READ) #define ESR_ELx_SYS64_ISS_SYS_CTR ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 1, 0, 0) #define ESR_ELx_SYS64_ISS_SYS_CTR_READ (ESR_ELx_SYS64_ISS_SYS_CTR | \ ESR_ELx_SYS64_ISS_DIR_READ) #define ESR_ELx_SYS64_ISS_SYS_CNTVCT (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \ ESR_ELx_SYS64_ISS_DIR_READ) #define ESR_ELx_SYS64_ISS_SYS_CNTVCTSS (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 6, 14, 0) | \ ESR_ELx_SYS64_ISS_DIR_READ) #define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \ ESR_ELx_SYS64_ISS_DIR_READ) #define esr_sys64_to_sysreg(e) \ sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >> \ ESR_ELx_SYS64_ISS_OP0_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ ESR_ELx_SYS64_ISS_OP1_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ ESR_ELx_SYS64_ISS_CRN_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ ESR_ELx_SYS64_ISS_CRM_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ ESR_ELx_SYS64_ISS_OP2_SHIFT)) #define esr_cp15_to_sysreg(e) \ sys_reg(3, \ (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ ESR_ELx_SYS64_ISS_OP1_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ ESR_ELx_SYS64_ISS_CRN_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ ESR_ELx_SYS64_ISS_CRM_SHIFT), \ (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ ESR_ELx_SYS64_ISS_OP2_SHIFT)) /* ISS field definitions for ERET/ERETAA/ERETAB trapping */ #define ESR_ELx_ERET_ISS_ERET 0x2 #define ESR_ELx_ERET_ISS_ERETA 0x1 /* * ISS field definitions for floating-point exception traps * (FP_EXC_32/FP_EXC_64). * * (The FPEXC_* constants are used instead for common bits.) 
*/ #define ESR_ELx_FP_EXC_TFV (UL(1) << 23) /* * ISS field definitions for CP15 accesses */ #define ESR_ELx_CP15_32_ISS_DIR_MASK 0x1 #define ESR_ELx_CP15_32_ISS_DIR_READ 0x1 #define ESR_ELx_CP15_32_ISS_DIR_WRITE 0x0 #define ESR_ELx_CP15_32_ISS_RT_SHIFT 5 #define ESR_ELx_CP15_32_ISS_RT_MASK (UL(0x1f) << ESR_ELx_CP15_32_ISS_RT_SHIFT) #define ESR_ELx_CP15_32_ISS_CRM_SHIFT 1 #define ESR_ELx_CP15_32_ISS_CRM_MASK (UL(0xf) << ESR_ELx_CP15_32_ISS_CRM_SHIFT) #define ESR_ELx_CP15_32_ISS_CRN_SHIFT 10 #define ESR_ELx_CP15_32_ISS_CRN_MASK (UL(0xf) << ESR_ELx_CP15_32_ISS_CRN_SHIFT) #define ESR_ELx_CP15_32_ISS_OP1_SHIFT 14 #define ESR_ELx_CP15_32_ISS_OP1_MASK (UL(0x7) << ESR_ELx_CP15_32_ISS_OP1_SHIFT) #define ESR_ELx_CP15_32_ISS_OP2_SHIFT 17 #define ESR_ELx_CP15_32_ISS_OP2_MASK (UL(0x7) << ESR_ELx_CP15_32_ISS_OP2_SHIFT) #define ESR_ELx_CP15_32_ISS_SYS_MASK (ESR_ELx_CP15_32_ISS_OP1_MASK | \ ESR_ELx_CP15_32_ISS_OP2_MASK | \ ESR_ELx_CP15_32_ISS_CRN_MASK | \ ESR_ELx_CP15_32_ISS_CRM_MASK | \ ESR_ELx_CP15_32_ISS_DIR_MASK) #define ESR_ELx_CP15_32_ISS_SYS_VAL(op1, op2, crn, crm) \ (((op1) << ESR_ELx_CP15_32_ISS_OP1_SHIFT) | \ ((op2) << ESR_ELx_CP15_32_ISS_OP2_SHIFT) | \ ((crn) << ESR_ELx_CP15_32_ISS_CRN_SHIFT) | \ ((crm) << ESR_ELx_CP15_32_ISS_CRM_SHIFT)) #define ESR_ELx_CP15_64_ISS_DIR_MASK 0x1 #define ESR_ELx_CP15_64_ISS_DIR_READ 0x1 #define ESR_ELx_CP15_64_ISS_DIR_WRITE 0x0 #define ESR_ELx_CP15_64_ISS_RT_SHIFT 5 #define ESR_ELx_CP15_64_ISS_RT_MASK (UL(0x1f) << ESR_ELx_CP15_64_ISS_RT_SHIFT) #define ESR_ELx_CP15_64_ISS_RT2_SHIFT 10 #define ESR_ELx_CP15_64_ISS_RT2_MASK (UL(0x1f) << ESR_ELx_CP15_64_ISS_RT2_SHIFT) #define ESR_ELx_CP15_64_ISS_OP1_SHIFT 16 #define ESR_ELx_CP15_64_ISS_OP1_MASK (UL(0xf) << ESR_ELx_CP15_64_ISS_OP1_SHIFT) #define ESR_ELx_CP15_64_ISS_CRM_SHIFT 1 #define ESR_ELx_CP15_64_ISS_CRM_MASK (UL(0xf) << ESR_ELx_CP15_64_ISS_CRM_SHIFT) #define ESR_ELx_CP15_64_ISS_SYS_VAL(op1, crm) \ (((op1) << ESR_ELx_CP15_64_ISS_OP1_SHIFT) | \ ((crm) << ESR_ELx_CP15_64_ISS_CRM_SHIFT)) #define ESR_ELx_CP15_64_ISS_SYS_MASK (ESR_ELx_CP15_64_ISS_OP1_MASK | \ ESR_ELx_CP15_64_ISS_CRM_MASK | \ ESR_ELx_CP15_64_ISS_DIR_MASK) #define ESR_ELx_CP15_64_ISS_SYS_CNTVCT (ESR_ELx_CP15_64_ISS_SYS_VAL(1, 14) | \ ESR_ELx_CP15_64_ISS_DIR_READ) #define ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS (ESR_ELx_CP15_64_ISS_SYS_VAL(9, 14) | \ ESR_ELx_CP15_64_ISS_DIR_READ) #define ESR_ELx_CP15_32_ISS_SYS_CNTFRQ (ESR_ELx_CP15_32_ISS_SYS_VAL(0, 0, 14, 0) |\ ESR_ELx_CP15_32_ISS_DIR_READ) /* * ISS values for SME traps */ #define ESR_ELx_SME_ISS_SME_DISABLED 0 #define ESR_ELx_SME_ISS_ILL 1 #define ESR_ELx_SME_ISS_SM_DISABLED 2 #define ESR_ELx_SME_ISS_ZA_DISABLED 3 #define ESR_ELx_SME_ISS_ZT_DISABLED 4 /* ISS field definitions for MOPS exceptions */ #define ESR_ELx_MOPS_ISS_MEM_INST (UL(1) << 24) #define ESR_ELx_MOPS_ISS_FROM_EPILOGUE (UL(1) << 18) #define ESR_ELx_MOPS_ISS_WRONG_OPTION (UL(1) << 17) #define ESR_ELx_MOPS_ISS_OPTION_A (UL(1) << 16) #define ESR_ELx_MOPS_ISS_DESTREG(esr) (((esr) & (UL(0x1f) << 10)) >> 10) #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) /* ISS field definitions for GCS */ #define ESR_ELx_ExType_SHIFT (20) #define ESR_ELx_ExType_MASK GENMASK(23, 20) #define ESR_ELx_Raddr_SHIFT (10) #define ESR_ELx_Raddr_MASK GENMASK(14, 10) #define ESR_ELx_Rn_SHIFT (5) #define ESR_ELx_Rn_MASK GENMASK(9, 5) #define ESR_ELx_Rvalue_SHIFT 5 #define ESR_ELx_Rvalue_MASK GENMASK(9, 5) #define ESR_ELx_IT_SHIFT (0) #define ESR_ELx_IT_MASK GENMASK(4, 0) #define 
ESR_ELx_ExType_DATA_CHECK 0 #define ESR_ELx_ExType_EXLOCK 1 #define ESR_ELx_ExType_STR 2 #define ESR_ELx_IT_RET 0 #define ESR_ELx_IT_GCSPOPM 1 #define ESR_ELx_IT_RET_KEYA 2 #define ESR_ELx_IT_RET_KEYB 3 #define ESR_ELx_IT_GCSSS1 4 #define ESR_ELx_IT_GCSSS2 5 #define ESR_ELx_IT_GCSPOPCX 6 #define ESR_ELx_IT_GCSPOPX 7 #ifndef __ASSEMBLY__ #include <asm/types.h> static inline unsigned long esr_brk_comment(unsigned long esr) { return esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; } static inline bool esr_is_data_abort(unsigned long esr) { const unsigned long ec = ESR_ELx_EC(esr); return ec == ESR_ELx_EC_DABT_LOW || ec == ESR_ELx_EC_DABT_CUR; } static inline bool esr_is_cfi_brk(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && (esr_brk_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE; } static inline bool esr_is_ubsan_brk(unsigned long esr) { return (esr_brk_comment(esr) & ~UBSAN_BRK_MASK) == UBSAN_BRK_IMM; } static inline bool esr_fsc_is_translation_fault(unsigned long esr) { esr = esr & ESR_ELx_FSC; return (esr == ESR_ELx_FSC_FAULT_L(3)) || (esr == ESR_ELx_FSC_FAULT_L(2)) || (esr == ESR_ELx_FSC_FAULT_L(1)) || (esr == ESR_ELx_FSC_FAULT_L(0)) || (esr == ESR_ELx_FSC_FAULT_L(-1)); } static inline bool esr_fsc_is_permission_fault(unsigned long esr) { esr = esr & ESR_ELx_FSC; return (esr == ESR_ELx_FSC_PERM_L(3)) || (esr == ESR_ELx_FSC_PERM_L(2)) || (esr == ESR_ELx_FSC_PERM_L(1)) || (esr == ESR_ELx_FSC_PERM_L(0)); } static inline bool esr_fsc_is_access_flag_fault(unsigned long esr) { esr = esr & ESR_ELx_FSC; return (esr == ESR_ELx_FSC_ACCESS_L(3)) || (esr == ESR_ELx_FSC_ACCESS_L(2)) || (esr == ESR_ELx_FSC_ACCESS_L(1)) || (esr == ESR_ELx_FSC_ACCESS_L(0)); } static inline bool esr_fsc_is_addr_sz_fault(unsigned long esr) { esr &= ESR_ELx_FSC; return (esr == ESR_ELx_FSC_ADDRSZ_L(3)) || (esr == ESR_ELx_FSC_ADDRSZ_L(2)) || (esr == ESR_ELx_FSC_ADDRSZ_L(1)) || (esr == ESR_ELx_FSC_ADDRSZ_L(0)) || (esr == ESR_ELx_FSC_ADDRSZ_L(-1)); } static inline bool esr_fsc_is_sea_ttw(unsigned long esr) { esr = esr & ESR_ELx_FSC; return (esr == ESR_ELx_FSC_SEA_TTW(3)) || (esr == ESR_ELx_FSC_SEA_TTW(2)) || (esr == ESR_ELx_FSC_SEA_TTW(1)) || (esr == ESR_ELx_FSC_SEA_TTW(0)) || (esr == ESR_ELx_FSC_SEA_TTW(-1)); } static inline bool esr_fsc_is_secc_ttw(unsigned long esr) { esr = esr & ESR_ELx_FSC; return (esr == ESR_ELx_FSC_SECC_TTW(3)) || (esr == ESR_ELx_FSC_SECC_TTW(2)) || (esr == ESR_ELx_FSC_SECC_TTW(1)) || (esr == ESR_ELx_FSC_SECC_TTW(0)) || (esr == ESR_ELx_FSC_SECC_TTW(-1)); } /* Indicate whether ESR.EC==0x1A is for an ERETAx instruction */ static inline bool esr_iss_is_eretax(unsigned long esr) { return esr & ESR_ELx_ERET_ISS_ERET; } /* Indicate which key is used for ERETAx (false: A-Key, true: B-Key) */ static inline bool esr_iss_is_eretab(unsigned long esr) { return esr & ESR_ELx_ERET_ISS_ERETA; } const char *esr_get_class_string(unsigned long esr); #endif /* __ASSEMBLY */ #endif /* __ASM_ESR_H */
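A userspace sketch of pulling the EC and ISS fields out of a raw ESR value using the shift/mask layout defined above. The sample ESR value is made up for illustration (its EC decodes to 0x25, a data abort taken from the current EL):

#include <stdio.h>

/* mirrors ESR_ELx_EC_SHIFT/MASK and ESR_ELx_ISS_MASK above */
#define EC_SHIFT 26
#define EC_MASK  (0x3FUL << EC_SHIFT)
#define ISS_MASK 0x1FFFFFFUL	/* bits 24:0 */

int main(void)
{
	unsigned long esr = 0x96000045UL;	/* illustrative data-abort ESR */
	unsigned long ec  = (esr & EC_MASK) >> EC_SHIFT;
	unsigned long iss = esr & ISS_MASK;

	printf("EC=0x%02lx ISS=0x%07lx\n", ec, iss);	/* EC=0x25: DABT_CUR */
	return 0;
}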
/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/ns_common.h> #include <linux/fs_pin.h> extern struct list_head notify_list; typedef __u32 __bitwise mntns_flags_t; #define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) struct mnt_namespace { struct ns_common ns; struct mount * root; struct { struct rb_root mounts; /* Protected by namespace_sem */ struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */ struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */ }; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ union { wait_queue_head_t poll; struct rcu_head mnt_ns_rcu; }; u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; #ifdef CONFIG_FSNOTIFY __u32 n_fsnotify_mask; struct fsnotify_mark_connector __rcu *n_fsnotify_marks; #endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { int mnt_count; int mnt_writers; }; struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; struct hlist_head m_list; int m_count; }; struct mount { struct hlist_node mnt_hash; struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; union { struct rb_node mnt_node; /* node in the ns->mounts rbtree */ struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else int mnt_count; int mnt_writers; #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g.
/dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ union { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; struct list_head to_notify; /* need to queue notification */ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; } __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ static inline struct mount *real_mount(struct vfsmount *mnt) { return container_of(mnt, struct mount, mnt); } static inline int mnt_has_parent(struct mount *mnt) { return mnt != mnt->mnt_parent; } static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { struct mount *m = __lookup_mnt(path->mnt, path->dentry); return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); } extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) { if (!d_mountpoint(dentry)) return; __detach_mounts(dentry); } static inline void get_mnt_ns(struct mnt_namespace *ns) { refcount_inc(&ns->ns.count); } extern seqlock_t mount_lock; struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); }; extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); static inline bool is_local_mountpoint(struct dentry *dentry) { if (!d_mountpoint(dentry)) return false; return __is_local_mountpoint(dentry); } static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } static inline bool mnt_ns_attached(const struct mount *mnt) { return !RB_EMPTY_NODE(&mnt->mnt_node); } static inline bool mnt_ns_empty(const struct mnt_namespace *ns) { return RB_EMPTY_ROOT(&ns->mounts); } static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { struct mnt_namespace *ns = mnt->mnt_ns; WARN_ON(!mnt_ns_attached(mnt)); if (ns->mnt_last_node == &mnt->mnt_node) ns->mnt_last_node = rb_prev(&mnt->mnt_node); if (ns->mnt_first_node == &mnt->mnt_node) ns->mnt_first_node = rb_next(&mnt->mnt_node); rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } bool has_locked_children(struct mount *mnt, struct dentry *dentry); struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct 
mnt_namespace, ns); } #ifdef CONFIG_FSNOTIFY static inline void mnt_notify_add(struct mount *m) { /* Optimize the case where there are no watches */ if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || (m->prev_ns && m->prev_ns->n_fsnotify_marks)) list_add_tail(&m->to_notify, &notify_list); else m->prev_ns = m->mnt_ns; } #else static inline void mnt_notify_add(struct mount *m) { } #endif struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
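real_mount() above is the classic container_of() pattern: recover the enclosing structure from a pointer to an embedded member. A minimal userspace illustration with stand-in types:

#include <stddef.h>
#include <stdio.h>

/* same arithmetic as the kernel macro: member pointer minus member offset */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vfsmount_like { int mnt_flags; };

struct mount_like {
	int mnt_id;
	struct vfsmount_like mnt;	/* embedded, like vfsmount in struct mount */
};

int main(void)
{
	struct mount_like m = { .mnt_id = 42 };
	struct vfsmount_like *v = &m.mnt;
	struct mount_like *back = container_of(v, struct mount_like, mnt);

	printf("mnt_id = %d\n", back->mnt_id);	/* 42 */
	return 0;
}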
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012-2015 - ARM Ltd * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <hyp/sysreg-sr.h> #include <linux/compiler.h> #include <linux/kvm_host.h> #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_nested.h> static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) { /* These registers are common with EL1 */ __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); /* * In VHE mode those registers are compatible between EL1 and EL2, * and the guest uses the _EL1 versions on the CPU naturally. * So we save them into their _EL2 versions here. * For nVHE mode we trap accesses to those registers, so our * _EL2 copy in sys_regs[] is always up-to-date and we don't need * to save anything here. */ if (vcpu_el2_e2h_is_set(vcpu)) { u64 val; /* * We don't save CPTR_EL2, as accesses to CPACR_EL1 * are always trapped, ensuring that the in-memory * copy is always up-to-date. A small blessing... */ __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); } /* * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where * the interesting CNTHCTL_EL2 bits live. So preserve these * bits when reading back the guest-visible value.
*/ val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; } __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); } static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) { u64 val; /* These registers are common with EL1 */ write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1), par_el1); write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1), tpidr_el1); write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt), vpidr_el2); write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1), vmpidr_el2); write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2), SYS_MAIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2), SYS_VBAR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2), SYS_CONTEXTIDR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2), SYS_AMAIR); if (vcpu_el2_e2h_is_set(vcpu)) { /* * In VHE mode those registers are compatible between * EL1 and EL2. */ write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2), SYS_SCTLR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2), SYS_CPACR); write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2), SYS_TTBR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2), SYS_TTBR1); write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2), SYS_TCR); write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL); } else { /* * CNTHCTL_EL2 only affects EL1 when running nVHE, so * no need to restore it. */ val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2)); write_sysreg_el1(val, SYS_SCTLR); val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2)); write_sysreg_el1(val, SYS_CPACR); val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2)); write_sysreg_el1(val, SYS_TTBR0); val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2)); write_sysreg_el1(val, SYS_TCR); } if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR); write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR); } write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2), SYS_ESR); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2), SYS_AFSR0); write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2), SYS_AFSR1); write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2), SYS_FAR); write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2), sp_el1); write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2), SYS_ELR); write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2), SYS_SPSR); } /* * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and * pstate, which are handled as part of the el2 return state) on every * switch (sp_el0 is being dealt with in the assembly code). * tpidr_el0 and tpidrro_el0 only need to be switched when going * to host userspace or a different VCPU. EL1 registers only need to be * switched when potentially going to run a different VCPU. The latter two * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put. 
*/ void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_save_common_state(ctxt); } NOKPROBE_SYMBOL(sysreg_save_host_state_vhe); void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_save_common_state(ctxt); __sysreg_save_el2_return_state(ctxt); } NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe); void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_restore_common_state(ctxt); } NOKPROBE_SYMBOL(sysreg_restore_host_state_vhe); void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) { __sysreg_restore_common_state(ctxt); __sysreg_restore_el2_return_state(ctxt); } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * * @vcpu: The VCPU pointer * * Load system registers that do not affect the host's execution, for * example EL1 system registers on a VHE system where the host kernel * runs at EL2. This function is called from KVM's vcpu_load() function * and loading system register state early avoids having to load them on * every entry to the VM. */ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; u64 midr, mpidr; host_ctxt = host_data_ptr(host_ctxt); __sysreg_save_user_state(host_ctxt); /* * When running a normal EL1 guest, we only load a new vcpu * after a context switch, which involves a DSB, so all * speculative EL1&0 walks will have already completed. * If running NV, the vcpu may transition between vEL1 and * vEL2 without a context switch, so make sure we complete * those walks before loading a new context. */ if (vcpu_has_nv(vcpu)) dsb(nsh); /* * Load guest EL1 and user state * * We must restore the 32-bit state before the sysregs, thanks * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { /* * As we're restoring a nested guest, set the value * provided by the guest hypervisor. */ midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2); mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2); } else { midr = ctxt_midr_el1(guest_ctxt); mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1); } __sysreg_restore_el1_state(guest_ctxt, midr, mpidr); } vcpu_set_flag(vcpu, SYSREGS_ON_CPU); } /** * __vcpu_put_switch_sysregs - Restore host system registers to the physical CPU * * @vcpu: The VCPU pointer * * Save guest system registers that do not affect the host's execution, for * example EL1 system registers on a VHE system where the host kernel * runs at EL2. This function is called from KVM's vcpu_put() function * and deferring saving system register state until we're no longer running the * VCPU avoids having to save them on every exit from the VM. */ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt; struct kvm_cpu_context *host_ctxt; host_ctxt = host_data_ptr(host_ctxt); if (unlikely(is_hyp_ctxt(vcpu))) __sysreg_save_vel2_state(vcpu); else __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); __sysreg32_save_state(vcpu); /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); }
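The comment block above names kvm_arch_vcpu_load() and kvm_arch_vcpu_put() as the points where the deferred register classes get switched. As a rough sketch of that pairing only (the _sketch suffix marks these as illustrations; the real callers do considerably more work around timers, the vgic and the scheduler):

/*
 * Simplified sketch of the load/put pairing described above;
 * not the actual kvm_arch_vcpu_load()/kvm_arch_vcpu_put().
 */
static void kvm_arch_vcpu_load_sketch(struct kvm_vcpu *vcpu, int cpu)
{
	/* ...timer/vgic/scheduler setup elided... */
	if (has_vhe())
		__vcpu_load_switch_sysregs(vcpu);	/* guest sysregs now live on the CPU */
}

static void kvm_arch_vcpu_put_sketch(struct kvm_vcpu *vcpu)
{
	if (has_vhe())
		__vcpu_put_switch_sysregs(vcpu);	/* guest state saved, host user state restored */
	/* ...remaining teardown elided... */
}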
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/of.h> #include <linux/of_device.h> #include <linux/of_address.h> #include <linux/of_iommu.h> #include <linux/of_reserved_mem.h> #include <linux/dma-direct.h> /* for bus_dma_region */ #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <asm/errno.h> #include "of_private.h" /** * of_match_device - Tell if a struct device matches an of_device_id list * @matches: array of of_device_id match structures to search in * @dev: the of device structure to match against * * Used by a driver to check whether a platform_device present in the * system is in its list of supported devices. */ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } EXPORT_SYMBOL(of_match_device); static void of_dma_set_restricted_buffer(struct device *dev, struct device_node *np) { struct device_node *node, *of_node = dev->of_node; int count, i; if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL)) return; count = of_property_count_elems_of_size(of_node, "memory-region", sizeof(u32)); /* * If dev->of_node doesn't exist or doesn't contain memory-region, try * the OF node having DMA configuration. */ if (count <= 0) { of_node = np; count = of_property_count_elems_of_size( of_node, "memory-region", sizeof(u32)); } for (i = 0; i < count; i++) { node = of_parse_phandle(of_node, "memory-region", i); /* * There might be multiple memory regions, but only one * restricted-dma-pool region is allowed. */ if (of_device_is_compatible(node, "restricted-dma-pool") && of_device_is_available(node)) { of_node_put(node); break; } of_node_put(node); } /* * Attempt to initialize a restricted-dma-pool region if one was found. * Note that count can hold a negative error code.
*/ if (i < count && of_reserved_mem_device_init_by_idx(dev, of_node, i)) dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n"); } /** * of_dma_configure_id - Setup DMA configuration * @dev: Device to apply DMA configuration * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. * @id: Optional const pointer to an input id * * Try to get the device's DMA configuration from DT and update it * accordingly. * * If platform code needs to use its own special DMA configuration, it * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ int of_dma_configure_id(struct device *dev, struct device_node *np, bool force_dma, const u32 *id) { const struct bus_dma_region *map = NULL; struct device_node *bus_np; u64 mask, end = 0; bool coherent, set_map = false; int ret; if (dev->dma_range_map) { dev_dbg(dev, "dma_range_map already set\n"); goto skip_map; } if (np == dev->of_node) bus_np = __of_get_dma_parent(np); else bus_np = of_node_get(np); ret = of_dma_get_range(bus_np, &map); of_node_put(bus_np); if (ret < 0) { /* * For legacy reasons, we have to assume some devices need * DMA configuration regardless of whether "dma-ranges" is * correctly specified or not. */ if (!force_dma) return ret == -ENODEV ? 0 : ret; } else { /* Determine the overall bounds of all DMA regions */ end = dma_range_map_max(map); set_map = true; } skip_map: /* * If @dev is expected to be DMA-capable then the bus code that created * it should have initialised its dma_mask pointer by this point. For * now, we'll continue the legacy behaviour of coercing it to the * coherent mask if not, but we'll no longer do so quietly. */ if (!dev->dma_mask) { dev_warn(dev, "DMA mask not set\n"); dev->dma_mask = &dev->coherent_dma_mask; } if (!end && dev->coherent_dma_mask) end = dev->coherent_dma_mask; else if (!end) end = (1ULL << 32) - 1; /* * Limit coherent and dma mask based on size and default mask * set by the driver. */ mask = DMA_BIT_MASK(ilog2(end) + 1); dev->coherent_dma_mask &= mask; *dev->dma_mask &= mask; /* ...but only set bus limit and range map if we found valid dma-ranges earlier */ if (set_map) { dev->bus_dma_limit = end; dev->dma_range_map = map; } coherent = of_dma_is_coherent(np); dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); ret = of_iommu_configure(dev, np, id); if (ret == -EPROBE_DEFER) { /* Don't touch range map if it wasn't set from a valid dma-ranges */ if (set_map) dev->dma_range_map = NULL; kfree(map); return -EPROBE_DEFER; } /* Take all other IOMMU errors to mean we'll just carry on without it */ dev_dbg(dev, "device is%sbehind an iommu\n", !ret ?
" " : " not "); arch_setup_dma_ops(dev, coherent); if (ret) of_dma_set_restricted_buffer(dev, np); return 0; } EXPORT_SYMBOL_GPL(of_dma_configure_id); const void *of_device_get_match_data(const struct device *dev) { const struct of_device_id *match; match = of_match_device(dev->driver->of_match_table, dev); if (!match) return NULL; return match->data; } EXPORT_SYMBOL(of_device_get_match_data); /** * of_device_modalias - Fill buffer with newline terminated modalias string * @dev: Calling device * @str: Modalias string * @len: Size of @str */ ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) { ssize_t sl; if (!dev || !dev->of_node || dev->of_node_reused) return -ENODEV; sl = of_modalias(dev->of_node, str, len - 2); if (sl < 0) return sl; if (sl > len - 2) return -ENOMEM; str[sl++] = '\n'; str[sl] = 0; return sl; } EXPORT_SYMBOL_GPL(of_device_modalias); /** * of_device_uevent - Display OF related uevent information * @dev: Device to display the uevent information for * @env: Kernel object's userspace event reference to fill up */ void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { const char *compat, *type; struct alias_prop *app; struct property *p; int seen = 0; if ((!dev) || (!dev->of_node)) return; add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node); add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node); type = of_node_get_device_type(dev->of_node); if (type) add_uevent_var(env, "OF_TYPE=%s", type); /* Since the compatible field can contain pretty much anything * it's not really legal to split it out with commas. We split it * up using a number of environment variables instead. */ of_property_for_each_string(dev->of_node, "compatible", p, compat) { add_uevent_var(env, "OF_COMPATIBLE_%d=%s", seen, compat); seen++; } add_uevent_var(env, "OF_COMPATIBLE_N=%d", seen); seen = 0; mutex_lock(&of_mutex); list_for_each_entry(app, &aliases_lookup, link) { if (dev->of_node == app->np) { add_uevent_var(env, "OF_ALIAS_%d=%s", seen, app->alias); seen++; } } mutex_unlock(&of_mutex); } EXPORT_SYMBOL_GPL(of_device_uevent); int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { int sl; if ((!dev) || (!dev->of_node) || dev->of_node_reused) return -ENODEV; /* Devicetree modalias is tricky, we add it in 2 steps */ if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; sl = of_modalias(dev->of_node, &env->buf[env->buflen-1], sizeof(env->buf) - env->buflen); if (sl < 0) return sl; if (sl >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += sl; return 0; } EXPORT_SYMBOL_GPL(of_device_uevent_modalias); /** * of_device_make_bus_id - Use the device node data to assign a unique name * @dev: pointer to device structure that is linked to a device tree node * * This routine will first try using the translated bus address to * derive a unique name. If it cannot, then it will prepend names from * parent nodes until a unique name can be derived. */ void of_device_make_bus_id(struct device *dev) { struct device_node *node = dev->of_node; const __be32 *reg; u64 addr; u32 mask; /* Construct the name, using parent nodes if necessary to ensure uniqueness */ while (node->parent) { /* * If the address can be translated, then that is as much * uniqueness as we need. Make it the first component and return */ reg = of_get_property(node, "reg", NULL); if (reg && (addr = of_translate_address(node, reg)) != OF_BAD_ADDR) { if (!of_property_read_u32(node, "mask", &mask)) dev_set_name(dev, dev_name(dev) ? 
"%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn", addr, ffs(mask) - 1, node, dev_name(dev)); else dev_set_name(dev, dev_name(dev) ? "%llx.%pOFn:%s" : "%llx.%pOFn", addr, node, dev_name(dev)); return; } /* format arguments only used if dev_name() resolves to NULL */ dev_set_name(dev, dev_name(dev) ? "%s:%s" : "%s", kbasename(node->full_name), dev_name(dev)); node = node->parent; } } EXPORT_SYMBOL_GPL(of_device_make_bus_id);
// SPDX-License-Identifier: GPL-2.0-or-later #include <linux/syscalls.h> #include <linux/time_namespace.h> #include "futex.h" /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. * * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after * acquiring the lock, but just before it could have added itself to * the list. There can only be one such pending lock.
*/ /** * sys_set_robust_list() - Set the robust-futex list head of a task * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { /* * The kernel knows only one size for now: */ if (unlikely(len != sizeof(*head))) return -EINVAL; current->robust_list = head; return 0; } /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { struct robust_list_head __user *head; unsigned long ret; struct task_struct *p; rcu_read_lock(); ret = -ESRCH; if (!pid) p = current; else { p = find_task_by_vpid(pid); if (!p) goto err_unlock; } ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); err_unlock: rcu_read_unlock(); return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; if (flags & FLAGS_CLOCKRT) { if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: flags |= FLAGS_CLOCKRT; fallthrough; case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } static __always_inline bool futex_cmd_has_timeout(u32 cmd) { switch (cmd) { case FUTEX_WAIT: case FUTEX_LOCK_PI: case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: return true; } return false; } static __always_inline int futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) { if (!timespec64_valid(ts)) return -EINVAL; *t = timespec64_to_ktime(*ts); if (cmd == FUTEX_WAIT) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if 
(get_timespec64(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } /** * futex_parse_waitv - Parse a waitv array from userspace * @futexv: Kernel side list of waiters to be filled * @uwaitv: Userspace list to be parsed * @nr_futexes: Length of futexv * @wake: Wake to call when futex is woken * @wake_data: Data for the wake handler * * Return: Error code on failure, 0 on success */ int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data) { struct futex_waitv aux; unsigned int i; for (i = 0; i < nr_futexes; i++) { unsigned int flags; if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; flags = futex2_to_flags(aux.flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, aux.val)) return -EINVAL; futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; futexv[i].q.wake = wake; futexv[i].q.wake_data = wake_data; } return 0; } static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, clockid_t clockid, struct hrtimer_sleeper *to) { int flag_clkid = 0, flag_init = 0; struct timespec64 ts; ktime_t time; int ret; if (!timeout) return 0; if (clockid == CLOCK_REALTIME) { flag_clkid = FLAGS_CLOCKRT; flag_init = FUTEX_CLOCK_REALTIME; } if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return -EINVAL; if (get_timespec64(&ts, timeout)) return -EFAULT; /* * Since there's no opcode for futex_waitv, use * FUTEX_WAIT_BITSET that uses absolute timeout as well */ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); if (ret) return ret; futex_setup_timer(&time, to, flag_clkid, 0); return 0; } static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); } /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on * @nr_futexes: Length of futexv * @flags: Flag for timeout (monotonic/realtime) * @timeout: Optional absolute timeout. * @clockid: Clock to be used for the timeout, realtime or monotonic. * * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes * if a futex_wake() is performed at any uaddr. The syscall returns immediately * if any waiter has *uaddr != val. *timeout is an optional timeout value for * the operation. Each waiter has individual flags. The `flags` argument for * the syscall should be used solely for specifying the timeout as realtime, if * needed. Flags for private futexes, sizes, etc. should be used on the * individual flags of each waiter. * * Returns the array index of one of the woken futexes. No further information * is provided: any number of other futexes may also have been woken by the * same event, and if more than one futex was woken, the returned index may * refer to any one of them. (It is not necessarily the futex with the * smallest index, nor the one most recently woken, nor...)
*/ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, unsigned int, nr_futexes, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; struct futex_vector *futexv; int ret; /* This syscall supports no flags for now */ if (flags) return -EINVAL; if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); if (!futexv) { ret = -ENOMEM; goto destroy_timer; } ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark, NULL); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); kfree(futexv); destroy_timer: if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_wake - Wake a number of futexes * @uaddr: Address of the futex(es) to wake * @mask: bitmask * @nr: Number of the futexes to wake * @flags: FUTEX2 flags * * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_wake, void __user *, uaddr, unsigned long, mask, int, nr, unsigned int, flags) { if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, mask)) return -EINVAL; return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } /* * sys_futex_wait - Wait on a futex * @uaddr: Address of the futex to wait on * @val: Value of @uaddr * @mask: bitmask * @flags: FUTEX2 flags * @timeout: Optional absolute timeout * @clockid: Clock to be used for the timeout, realtime or monotonic * * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE6(futex_wait, void __user *, uaddr, unsigned long, val, unsigned long, mask, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; int ret; if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, val) || !futex_validate_input(flags, mask)) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_requeue - Requeue a waiter from one futex to another * @waiters: array describing the source and destination futex * @flags: unused * @nr_wake: number of futexes to wake * @nr_requeue: number of futexes to requeue * * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the * futex2 family of calls.
*/ SYSCALL_DEFINE4(futex_requeue, struct futex_waitv __user *, waiters, unsigned int, flags, int, nr_wake, int, nr_requeue) { struct futex_vector futexes[2]; u32 cmpval; int ret; if (flags) return -EINVAL; if (!waiters) return -EINVAL; ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL); if (ret) return ret; cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, nr_wake, nr_requeue, &cmpval, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { if (unlikely(len != sizeof(*head))) return -EINVAL; current->compat_robust_list = head; return 0; } COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; struct task_struct *p; rcu_read_lock(); ret = -ESRCH; if (!pid) p = current; else { p = find_task_by_vpid(pid); if (!p) goto err_unlock; } ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; rcu_read_unlock(); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); err_unlock: rcu_read_unlock(); return ret; } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */
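For reference, the classic FUTEX_WAIT/FUTEX_WAKE pair dispatched by do_futex() above has no glibc wrapper and is driven through syscall(2). A minimal, self-contained userspace sketch (the futex word and thread layout are of course illustrative, not part of the kernel code above):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdio.h>

static unsigned int futex_word;	/* 0 = not ready, 1 = ready */

static void *waker(void *arg)
{
	__atomic_store_n(&futex_word, 1, __ATOMIC_SEQ_CST);
	/* Wake at most one waiter blocked on the word */
	syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	/*
	 * FUTEX_WAIT sleeps only while the word still holds the expected
	 * value 0; if the word changed between the load and the syscall,
	 * the kernel returns EAGAIN and the loop simply re-checks.
	 */
	while (__atomic_load_n(&futex_word, __ATOMIC_SEQ_CST) == 0)
		syscall(SYS_futex, &futex_word, FUTEX_WAIT, 0, NULL, NULL, 0);

	pthread_join(t, NULL);
	puts("woken");
	return 0;
}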
// SPDX-License-Identifier: GPL-2.0-only /* * fs/libfs.c * Library for filesystem writers.
*/ #include <linux/blkdev.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> #include <linux/fsnotify.h> #include <linux/unicode.h> #include <linux/fscrypt.h> #include <linux/pidfs.h> #include <linux/uaccess.h> #include "internal.h" int simple_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; return 0; } EXPORT_SYMBOL(simple_statfs); /* * Retaining negative dentries for an in-memory filesystem just wastes * memory and lookup time: arrange for them to be deleted immediately. */ int always_delete_dentry(const struct dentry *dentry) { return 1; } EXPORT_SYMBOL(always_delete_dentry); const struct dentry_operations simple_dentry_operations = { .d_delete = always_delete_dentry, }; EXPORT_SYMBOL(simple_dentry_operations); /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. Set d_op to delete negative dentries. */ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (!dentry->d_sb->s_d_op) d_set_d_op(dentry, &simple_dentry_operations); if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) return NULL; d_add(dentry, NULL); return NULL; } EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } EXPORT_SYMBOL(dcache_dir_open); int dcache_dir_close(struct inode *inode, struct file *file) { dput(file->private_data); return 0; } EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ /* * Returns an element of siblings' list. * We are looking for <count>th positive after <p>; if * found, dentry is grabbed and returned to caller. * If no such element exists, NULL is returned. 
*/ static struct dentry *scan_positives(struct dentry *cursor, struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); while (*p) { struct dentry *d = hlist_entry(*p, struct dentry, d_sib); p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(found)) break; count = 1; } if (need_resched()) { if (!hlist_unhashed(&cursor->d_sib)) __hlist_del(&cursor->d_sib); hlist_add_behind(&cursor->d_sib, &d->d_sib); p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); } } spin_unlock(&dentry->d_lock); dput(last); return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; inode_lock_shared(dentry->d_inode); if (offset > 2) to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (to) hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); file->f_pos = offset; inode_unlock_shared(dentry->d_inode); } return offset; } EXPORT_SYMBOL(dcache_dir_lseek); /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), * both impossible due to the lock on directory. 
*/ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; struct dentry *next = NULL; struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) p = &dentry->d_children.first; else p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_sib.next; } spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (next) hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); return 0; } EXPORT_SYMBOL(dcache_readdir); ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; } EXPORT_SYMBOL(generic_read_dir); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; EXPORT_SYMBOL(simple_dir_inode_operations); /* simple_offset_add() never assigns these to a dentry */ enum { DIR_OFFSET_FIRST = 2, /* Find first real entry */ DIR_OFFSET_EOD = S32_MAX, }; /* simple_offset_add() allocation range */ enum { DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) { dentry->d_fsdata = (void *)offset; } static long dentry2offset(struct dentry *dentry) { return (long)dentry->d_fsdata; } static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx * @octx: directory offset map to be initialized * */ void simple_offset_init(struct offset_ctx *octx) { mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; } /** * simple_offset_add - Add an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: new dentry being added * * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, DIR_OFFSET_MAX, &octx->next_offset, GFP_KERNEL); if (unlikely(ret < 0)) return ret == -EBUSY ? 
-ENOSPC : ret; offset_set(dentry, offset); return 0; } static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry, long offset) { int ret; ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL); if (ret) return ret; offset_set(dentry, offset); return 0; } /** * simple_offset_remove - Remove an entry from a directory's offset map * @octx: directory offset ctx to be updated * @dentry: dentry being removed * */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { long offset; offset = dentry2offset(dentry); if (offset == 0) return; mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } /** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry * @old_dentry: dentry of source entry * @new_dir: parent directory of destination entry * @new_dentry: dentry of destination * * Caller provides appropriate serialization. * * User space expects the directory offset value of the replaced * (new) directory entry to be unchanged after a rename. * * Returns zero on success, a negative errno value on failure. */ int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long new_offset = dentry2offset(new_dentry); simple_offset_remove(old_ctx, old_dentry); if (new_offset) { offset_set(new_dentry, 0); return simple_offset_replace(new_ctx, old_dentry, new_offset); } return simple_offset_add(new_ctx, old_dentry); } /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved * @new_dir: destination parent * @new_dentry: destination dentry * * This API preserves the directory offset values. Caller provides * appropriate serialization. * * Returns zero on success. Otherwise a negative errno is returned and the * rename is rolled back. */ int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long old_index = dentry2offset(old_dentry); long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); simple_offset_remove(new_ctx, new_dentry); ret = simple_offset_replace(new_ctx, old_dentry, new_index); if (ret) goto out_restore; ret = simple_offset_replace(old_ctx, new_dentry, old_index); if (ret) { simple_offset_remove(new_ctx, old_dentry); goto out_restore; } ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (ret) { simple_offset_remove(new_ctx, old_dentry); simple_offset_remove(old_ctx, new_dentry); goto out_restore; } return 0; out_restore: (void)simple_offset_replace(old_ctx, old_dentry, old_index); (void)simple_offset_replace(new_ctx, new_dentry, new_index); return ret; } /** * simple_offset_destroy - Release offset map * @octx: directory offset ctx that is about to be destroyed * * During fs teardown (e.g. umount), a directory's offset map might still * contain entries. mtree_destroy() cleans out anything that remains.
*/ void simple_offset_destroy(struct offset_ctx *octx) { mtree_destroy(&octx->mt); } /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated * @offset: a byte offset * @whence: enumerator describing the starting position for this update * * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. * * Returns the updated read position if successful; otherwise a * negative errno is returned and the read position remains unchanged. */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset >= 0) break; fallthrough; default: return -EINVAL; } return vfs_setpos(file, offset, LONG_MAX); } static struct dentry *find_positive_dentry(struct dentry *parent, struct dentry *dentry, bool next) { struct dentry *found = NULL; spin_lock(&parent->d_lock); if (next) dentry = d_next_sibling(dentry); else if (!dentry) dentry = d_first_child(parent); hlist_for_each_entry_from(dentry, d_sib) { if (!simple_positive(dentry)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(dentry)) found = dget_dlock(dentry); spin_unlock(&dentry->d_lock); if (likely(found)) break; } spin_unlock(&parent->d_lock); return found; } static noinline_for_stack struct dentry * offset_dir_lookup(struct dentry *parent, loff_t offset) { struct inode *inode = d_inode(parent); struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found = NULL; MA_STATE(mas, &octx->mt, offset, offset); if (offset == DIR_OFFSET_FIRST) found = find_positive_dentry(parent, NULL, false); else { rcu_read_lock(); child = mas_find_rev(&mas, DIR_OFFSET_MIN); found = find_positive_dentry(parent, child, false); rcu_read_unlock(); } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; struct dentry *dentry; dentry = offset_dir_lookup(dir, ctx->pos); if (!dentry) goto out_eod; while (true) { struct dentry *next; ctx->pos = dentry2offset(dentry); if (!offset_dir_emit(ctx, dentry)) break; next = find_positive_dentry(dir, dentry, true); dput(dentry); if (!next) goto out_eod; dentry = next; } dput(dentry); return; out_eod: ctx->pos = DIR_OFFSET_EOD; } /** * offset_readdir - Emit entries starting at offset @ctx->pos * @file: an open directory to iterate over * @ctx: directory iteration context * * Caller must hold @file's i_rwsem to prevent insertion or removal of * entries during this call. * * On entry, @ctx->pos contains an offset that represents the first entry * to be read from the directory. * * The operation continues until there are no more entries to read, or * until the ctx->actor indicates there is no more space in the caller's * output buffer. * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. * Caller places this value in the d_off field of the last entry in the * user's buffer. 
* * Return values: * %0 - Complete */ static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos != DIR_OFFSET_EOD) offset_iterate_dir(file, ctx); return 0; } const struct file_operations simple_offset_dir_operations = { .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, .fsync = noop_fsync, }; static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); d = prev ? d_next_sibling(prev) : d_first_child(parent); hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) child = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(child)) break; } } spin_unlock(&parent->d_lock); dput(prev); return child; } void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; inode_lock(inode); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; this = this->d_parent; inode = this->d_inode; inode_lock(inode); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts if (d_is_dir(victim)) fsnotify_rmdir(inode, victim); else fsnotify_unlink(inode, victim); if (callback) callback(victim); dput(victim); // unpin it } if (victim == dentry) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); dput(dentry); return; } } inode_unlock(inode); this = child; } } EXPORT_SYMBOL(simple_recursive_removal); static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) return -ENOMEM; /* * since this is the first inode, make it number 1. New inodes created * after this must take care not to collide with it (by passing * max_reserved of 1 to iunique). 
*/ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; s->s_d_op = ctx->dops; return 0; } static int pseudo_fs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, pseudo_fs_fill_super); } static void pseudo_fs_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations pseudo_fs_context_ops = { .free = pseudo_fs_free, .get_tree = pseudo_fs_get_tree, }; /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, unsigned long magic) { struct pseudo_fs_context *ctx; ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); if (likely(ctx)) { ctx->magic = magic; fc->fs_private = ctx; fc->ops = &pseudo_fs_context_ops; fc->sb_flags |= SB_NOUSER; fc->global = true; } return ctx; } EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) file->private_data = inode->i_private; return 0; } EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; } spin_unlock(&child->d_lock); } ret = 1; out: spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); dput(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); int simple_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; } EXPORT_SYMBOL(simple_rmdir); /** * simple_rename_timestamp - update the various inode timestamps for rename * @old_dir: old parent directory * @old_dentry: dentry that is being renamed * @new_dir: new parent directory * @new_dentry: target for rename * * POSIX mandates that the old and new parent directories have their ctime and * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have * their ctime updated. 
*/ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *newino = d_inode(new_dentry); inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (new_dir != old_dir) inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); inode_set_ctime_current(d_inode(old_dentry)); if (newino) inode_set_ctime_current(newino); } EXPORT_SYMBOL_GPL(simple_rename_timestamp); int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { bool old_is_dir = d_is_dir(old_dentry); bool new_is_dir = d_is_dir(new_dentry); if (old_dir != new_dir && old_is_dir != new_is_dir) { if (old_is_dir) { drop_nlink(old_dir); inc_nlink(new_dir); } else { drop_nlink(new_dir); inc_nlink(old_dir); } } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * * Returns 0 on success, -error on failure. * * simple_setattr is a simple ->setattr implementation without a proper * implementation of size changes. * * It can either be used for in-memory filesystems or special files * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. 
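 *
 * Wiring it up is typically a single line in an in-memory filesystem's
 * inode operations (sketch, the myfs_* name is hypothetical):
 *
 *	static const struct inode_operations myfs_file_inode_operations = {
 *		.setattr	= simple_setattr,
 *		.getattr	= simple_getattr,
 *	};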
*/ int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } EXPORT_SYMBOL(simple_setattr); static int simple_read_folio(struct file *file, struct folio *folio) { folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); folio_unlock(folio); return 0; } int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct folio *folio; folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + len, folio_size(folio)); } return 0; } EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " * @copied: " * @folio: " * @fsdata: " * * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. * * Use *ONLY* with simple_read_folio() */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* zero the stale part of the folio if we did a short copy */ if (!folio_test_uptodate(folio)) { if (copied < len) { size_t from = offset_in_folio(folio, pos); folio_zero_range(folio, from + copied, len - copied); } folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } /* * Provides ramfs-style behavior: data in the pagecache, but no writeback. */ const struct address_space_operations ram_aops = { .read_folio = simple_read_folio, .write_begin = simple_write_begin, .write_end = simple_write_end, .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. 
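 *
 * A typical caller passes an array of tree_descr entries terminated by
 * an empty name, leaving index 1 unused so it cannot collide with the
 * root inode (sketch, the myfs_* names are hypothetical):
 *
 *	static const struct tree_descr myfs_files[] = {
 *		[2] = { "status", &myfs_status_fops, 0444 },
 *		{ "" }
 *	};
 *
 *	err = simple_fill_super(sb, MYFS_MAGIC, myfs_files);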
*/ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; struct dentry *dentry; int i; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; inode = new_inode(s); if (!inode) return -ENOMEM; /* * because the root inode is 1, the files array must not contain an * entry at index 1 */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; /* warn if it tries to conflict with the root inode */ if (unlikely(i == 1)) printk(KERN_WARNING "%s: %s passed in a files array " "with an index of 1!\n", __func__, s->s_type->name); dentry = d_alloc_name(s->s_root, files->name); if (!dentry) return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); } return 0; } EXPORT_SYMBOL(simple_fill_super); static DEFINE_SPINLOCK(pin_fs_lock); int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); if (!*mount) *mount = mnt; } mntget(*mount); ++*count; spin_unlock(&pin_fs_lock); mntput(mnt); return 0; } EXPORT_SYMBOL(simple_pin_fs); void simple_release_fs(struct vfsmount **mount, int *count) { struct vfsmount *mnt; spin_lock(&pin_fs_lock); mnt = *mount; if (!--*count) *mount = NULL; spin_unlock(&pin_fs_lock); mntput(mnt); } EXPORT_SYMBOL(simple_release_fs); /** * simple_read_from_buffer - copy data from the buffer to user space * @to: the user space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The simple_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the user space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or a negative value is returned on error. **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; size_t ret; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; ret = copy_to_user(to, from + pos, count); if (ret == count) return -EFAULT; count -= ret; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_read_from_buffer); /** * simple_write_to_buffer - copy data from user space to the buffer * @to: the buffer to write to * @available: the size of the buffer * @ppos: the current position in the buffer * @from: the user space buffer to read from * @count: the maximum number of bytes to read * * The simple_write_to_buffer() function reads up to @count bytes from the user * space address starting at @from into the buffer @to at offset @ppos.
* * On success, the number of bytes written is returned and the offset @ppos is * advanced by this number, or a negative value is returned on error. **/ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count) { loff_t pos = *ppos; size_t res; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; res = copy_from_user(to + pos, from, count); if (res == count) return -EFAULT; count -= res; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_write_to_buffer); /** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The memory_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the kernel space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or a negative value is returned on error. **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; if (pos < 0) return -EINVAL; if (pos >= available) return 0; if (count > available - pos) count = available - pos; memcpy(to, from + pos, count); *ppos = pos + count; return count; } EXPORT_SYMBOL(memory_read_from_buffer); /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then * possibly a read which collects the result - which is stored in a * file-local buffer. */ void simple_transaction_set(struct file *file, size_t n) { struct simple_transaction_argresp *ar = file->private_data; BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); /* * The barrier ensures that ar->size will really remain zero until * ar->data is ready for reading.
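 * In other words, a reader that observes the updated ar->size is also
 * guaranteed to observe the transaction result previously written into
 * ar->data.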
*/ smp_mb(); ar->size = n; } EXPORT_SYMBOL(simple_transaction_set); char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; static DEFINE_SPINLOCK(simple_transaction_lock); if (size > SIMPLE_TRANSACTION_LIMIT - 1) return ERR_PTR(-EFBIG); ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); if (!ar) return ERR_PTR(-ENOMEM); spin_lock(&simple_transaction_lock); /* only one write allowed per open */ if (file->private_data) { spin_unlock(&simple_transaction_lock); free_page((unsigned long)ar); return ERR_PTR(-EBUSY); } file->private_data = ar; spin_unlock(&simple_transaction_lock); if (copy_from_user(ar->data, buf, size)) return ERR_PTR(-EFAULT); return ar->data; } EXPORT_SYMBOL(simple_transaction_get); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { struct simple_transaction_argresp *ar = file->private_data; if (!ar) return 0; return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); } EXPORT_SYMBOL(simple_transaction_read); int simple_transaction_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); return 0; } EXPORT_SYMBOL(simple_transaction_release); /* Simple attribute files */ struct simple_attr { int (*get)(void *, u64 *); int (*set)(void *, u64); char get_buf[24]; /* enough to store a u64 and "\n\0" */ char set_buf[24]; void *data; const char *fmt; /* format for read operation */ struct mutex mutex; /* protects access to these buffers */ }; /* simple_attr_open is called by an actual attribute open file operation * to set the attribute specific access operations. */ int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { struct simple_attr *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; attr->get = get; attr->set = set; attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); file->private_data = attr; return nonseekable_open(inode, file); } EXPORT_SYMBOL_GPL(simple_attr_open); int simple_attr_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? 
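 * (Most users never call simple_attr_open() by hand; the
 * DEFINE_SIMPLE_ATTRIBUTE() macro in <linux/fs.h> generates the
 * open/release handlers around it, roughly:
 *
 *	DEFINE_SIMPLE_ATTRIBUTE(myvar_fops, myvar_get, myvar_set, "%llu\n");
 *
 * where myvar_get/myvar_set, hypothetical names here, match the get/set
 * callbacks taken by simple_attr_open() above.)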
*/ /* read from the buffer that is filled with the get function */ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; size_t size; ssize_t ret; attr = file->private_data; if (!attr->get) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; if (*ppos && attr->get_buf[0]) { /* continued read */ size = strlen(attr->get_buf); } else { /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) goto out; size = scnprintf(attr->get_buf, sizeof(attr->get_buf), attr->fmt, (unsigned long long)val); } ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); out: mutex_unlock(&attr->mutex); return ret; } EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number and call the set function with it */ static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; size_t size; ssize_t ret; attr = file->private_data; if (!attr->set) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; ret = -EFAULT; size = min(sizeof(attr->set_buf) - 1, len); if (copy_from_user(attr->set_buf, buf, size)) goto out; attr->set_buf[size] = '\0'; if (is_signed) ret = kstrtoll(attr->set_buf, 0, &val); else ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; } ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(simple_attr_write); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** * generic_encode_ino32_fh - generic export_operations->encode_fh function * @inode: the object to encode * @fh: where to store the file handle fragment * @max_len: maximum length to store there (in 4-byte units) * @parent: parent directory inode, if wanted * * This generic encode_fh function assumes that the 32-bit inode number * is suitable for locating an inode, and that the generation number * can be used to check that it is still valid. It places them in the * filehandle fragment where export_decode_fh expects to find them.
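 *
 * A filesystem typically plugs it into its export operations like so
 * (sketch, the myfs_* names are hypothetical):
 *
 *	static const struct export_operations myfs_export_ops = {
 *		.encode_fh	= generic_encode_ino32_fh,
 *		.fh_to_dentry	= myfs_fh_to_dentry,
 *		.fh_to_parent	= myfs_fh_to_parent,
 *	};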
*/ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct fid *fid = (void *)fh; int len = *max_len; int type = FILEID_INO32_GEN; if (parent && (len < 4)) { *max_len = 4; return FILEID_INVALID; } else if (len < 2) { *max_len = 2; return FILEID_INVALID; } len = 2; fid->i32.ino = inode->i_ino; fid->i32.gen = inode->i_generation; if (parent) { fid->i32.parent_ino = parent->i_ino; fid->i32.parent_gen = parent->i_generation; len = 4; type = FILEID_INO32_GEN_PARENT; } *max_len = len; return type; } EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the object specified in the file handle. */ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.ino, fid->i32.gen); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the _parent_ object specified in the file handle if it * is specified in the file handle, or NULL otherwise. */ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len <= 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.parent_ino, (fh_len > 3 ? fid->i32.parent_gen : 0)); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** * __generic_file_fsync - generic fsync implementation for simple filesystems * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. 
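 *
 * Note that this helper does not flush the underlying device's volatile
 * write cache; filesystems sitting on a real block device normally want
 * generic_file_fsync() below, which issues that flush as a final step.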
*/ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: inode_unlock(inode); /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); /** * generic_file_fsync - generic fsync implementation for simple filesystems * with flush * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * */ int generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; err = __generic_file_fsync(file, start, end, datasync); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); /** * generic_check_addressable - Check addressability of file system * @blocksize_bits: log of file system block size * @num_blocks: number of blocks in file system * * Determine whether a file system with @num_blocks blocks (and a * block size of 2**@blocksize_bits) is addressable by the sector_t * and page cache of the system. Return 0 if so and -EFBIG otherwise. */ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits); if (unlikely(num_blocks == 0)) return 0; if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; } EXPORT_SYMBOL(generic_check_addressable); /* * No-op implementation of ->fsync for in-memory filesystems. */ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; } EXPORT_SYMBOL(noop_fsync); ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* * iomap based filesystems support direct I/O without need for * this callback. However, it still needs to be set in * inode->a_ops so that open/fcntl know that direct I/O is * generally supported. */ return -EINVAL; } EXPORT_SYMBOL_GPL(noop_direct_IO); /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { kfree(p); } EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &anon_aops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. 
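 * (mark_inode_dirty() only acts on a dirty-state transition, so an
 * inode that starts life with I_DIRTY set is never queued on a
 * writeback list.)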
*/ inode->i_state = I_DIRTY; inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); /** * simple_nosetlease - generic helper for prohibiting leases * @filp: file pointer * @arg: type of lease to obtain * @flp: new lease supplied for insertion * @priv: private data for lm_setup operation * * Generic helper for filesystems that do not wish to allow leases to be set. * All arguments are ignored and it just returns -EINVAL. */ int simple_nosetlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); /** * simple_get_link - generic helper to get the target of "fast" symlinks * @dentry: not used here * @inode: the symlink inode * @done: not used here * * Generic helper for filesystems to use for symlink inodes where a pointer to * the symlink target is stored in ->i_link. NOTE: this isn't normally called, * since as an optimization the path lookup code uses any non-NULL ->i_link * directly, without calling ->get_link(). But ->get_link() still must be set, * to mark the inode_operations as being for a symlink. * * Return: the symlink target */ const char *simple_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return inode->i_link; } EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, }; EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. */ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-ENOENT); } static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; } static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; } static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, .setattr = empty_dir_setattr, .listxattr = empty_dir_listxattr, }; static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) { /* An empty directory has two entries . and .. 
at offsets 0 and 1 */ return generic_file_llseek_size(file, offset, whence, 2, 2); } static int empty_dir_readdir(struct file *file, struct dir_context *ctx) { dir_emit_dots(file, ctx); return 0; } static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; void make_empty_dir_inode(struct inode *inode) { set_nlink(inode, 2); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } bool is_empty_dir_inode(struct inode *inode) { return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } #if IS_ENABLED(CONFIG_UNICODE) /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against * @len: len of name of dentry * @str: str pointer to name of dentry * @name: Name to compare against * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; union shortname_store strbuf; struct qstr qstr; /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. * * This comparison is safe under RCU because the caller * guarantees the consistency between str and len. See * __d_lookup_rcu_op_compare() for details. */ if (len == name->len && !memcmp(str, name->name, len)) return 0; parent = READ_ONCE(dentry->d_parent); dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) return 1; qstr.len = len; qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. * As above, len is guaranteed to match str, so the shortname case * is exactly when str points to ->d_shortname. 
*/ if (qstr.name == dentry->d_shortname.string) { strbuf = dentry->d_shortname; // NUL is guaranteed to be in there qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems * @dentry: dentry of the parent directory * @str: qstr of name whose hash we should fill in * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); if (ret < 0 && sb_has_strict_encoding(sb)) return -EINVAL; return 0; } EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, #ifdef CONFIG_FS_ENCRYPTION .d_revalidate = fscrypt_d_revalidate, #endif }; /** * generic_ci_match() - Match a name (case-insensitively) with a dirent. * This is a filesystem helper for comparison with directory entries. * generic_ci_d_compare should be used in VFS' ->d_compare instead. * * @parent: Inode of the parent of the dirent under comparison * @name: name under lookup. * @folded_name: Optional pre-folded name under lookup * @de_name: Dirent name. * @de_name_len: dirent name length. * * Test whether a case-insensitive directory entry matches the filename * being searched. If @folded_name is provided, it is used instead of * recalculating the casefold of @name. * * Return: > 0 if the directory entry matches, 0 if it doesn't match, or * < 0 on error. */ int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr dirent = QSTR_INIT(de_name, de_name_len); int res = 0; if (IS_ENCRYPTED(parent)) { const struct fscrypt_str encrypted_name = FSTR_INIT((u8 *) de_name, de_name_len); if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) return -EINVAL; decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); if (!decrypted_name.name) return -ENOMEM; res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, &decrypted_name); if (res < 0) { kfree(decrypted_name.name); return res; } dirent.name = decrypted_name.name; dirent.len = decrypted_name.len; } /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. 
*/ if (dirent.len == name->len && !memcmp(name->name, dirent.name, dirent.len)) goto out; if (folded_name->name) res = utf8_strncasecmp_folded(um, folded_name, &dirent); else res = utf8_strncasecmp(um, name, &dirent); out: kfree(decrypted_name.name); if (res < 0 && sb_has_strict_encoding(sb)) { pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); return 0; } return !res; } EXPORT_SYMBOL(generic_ci_match); #endif #ifdef CONFIG_FS_ENCRYPTION static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif /** * generic_set_sb_d_ops - helper for choosing the set of * filesystem-wide dentry operations for the enabled features * @sb: superblock to be configured * * Filesystems supporting casefolding and/or fscrypt can call this * helper at mount-time to configure sb->s_d_op to best set of dentry * operations required for the enabled features. The helper must be * called after these have been configured, but before the root dentry * is created. */ void generic_set_sb_d_ops(struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) { sb->s_d_op = &generic_ci_dentry_ops; return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (sb->s_cop) { sb->s_d_op = &generic_encrypted_dentry_ops; return; } #endif } EXPORT_SYMBOL(generic_set_sb_d_ops); /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated * @force: increment the counter even if it's not necessary? * * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * * If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. * * In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. */ bool inode_maybe_inc_iversion(struct inode *inode, bool force) { u64 cur, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * We add a full memory barrier to ensure that any de facto ordering * with other state is preserved (either implicitly coming from cmpxchg * or explicitly from smp_mb if we don't know upfront if we will execute * the former). * * These barriers pair with inode_query_iversion(). */ cur = inode_peek_iversion_raw(inode); if (!force && !(cur & I_VERSION_QUERIED)) { smp_mb(); cur = inode_peek_iversion_raw(inode); } do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); /** * inode_query_iversion - read i_version for later use * @inode: inode from which i_version should be read * * Read the inode i_version counter. This should be used by callers that wish * to store the returned i_version for later comparison. This will guarantee * that a later query of the i_version will result in a different value if * anything has changed. * * In this implementation, we fetch the current value, set the QUERIED flag and * then try to swap it into place with a cmpxchg, if it wasn't already set. If * that fails, we try again with the newly fetched value from the cmpxchg. 
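 *
 * A typical caller pattern (sketch):
 *
 *	u64 cookie = inode_query_iversion(inode);
 *	// ... hand out state derived from the inode ...
 *	bool changed = inode_query_iversion(inode) != cookie;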
*/ u64 inode_query_iversion(struct inode *inode) { u64 cur, new; bool fenced = false; /* * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with * inode_maybe_inc_iversion(), see that routine for more details. */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { if (!fenced) smp_mb(); break; } fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos - buffered_written; loff_t end = iocb->ki_pos - 1; int err; /* * If the buffered write fallback returned an error, we want to return * the number of bytes which were written by direct I/O, or the error * code if that was zero. * * Note that this differs from normal direct-io semantics, which will * return -EFOO even if some bytes were written. */ if (unlikely(buffered_written < 0)) { if (direct_written) return direct_written; return buffered_written; } /* * We need to ensure that the page cache pages are written to disk and * invalidated to preserve the expected O_DIRECT semantics. */ err = filemap_write_and_wait_range(mapping, pos, end); if (err < 0) { /* * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; } invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); /** * simple_inode_init_ts - initialize the timestamps for a new inode * @inode: inode to be initialized * * When a new inode is created, most filesystems set the timestamps to the * current time. Add a helper to do this. */ struct timespec64 simple_inode_init_ts(struct inode *inode) { struct timespec64 ts = inode_set_ctime_current(inode); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); return ts; } EXPORT_SYMBOL(simple_inode_init_ts); struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; guard(rcu)(); dentry = rcu_dereference(*stashed); if (!dentry) return NULL; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; } static struct dentry *prepare_anon_dentry(struct dentry **stashed, struct super_block *sb, void *data) { struct dentry *dentry; struct inode *inode; const struct stashed_operations *sops = sb->s_fs_info; int ret; inode = new_inode_pseudo(sb); if (!inode) { sops->put_data(data); return ERR_PTR(-ENOMEM); } inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG; simple_inode_init_ts(inode); ret = sops->init_inode(inode, data); if (ret < 0) { iput(inode); return ERR_PTR(ret); } /* Notice when this is changed. */ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); WARN_ON_ONCE(!IS_IMMUTABLE(inode)); dentry = d_alloc_anon(sb); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } /* Store address of location where dentry's supposed to be stashed. */ dentry->d_fsdata = stashed; /* @data is now owned by the fs */ d_instantiate(dentry, inode); return dentry; } static struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { struct dentry *old; /* Assume any old dentry was cleared out. 
*/ old = cmpxchg(stashed, NULL, dentry); if (likely(!old)) return dentry; /* Check if somebody else installed a reusable dentry. */ if (lockref_get_not_dead(&old->d_lockref)) return old; /* There's an old dead dentry there, try to take it over. */ if (likely(try_cmpxchg(stashed, &old, dentry))) return dentry; } } /** * path_from_stashed - create path from stashed or new dentry * @stashed: where to retrieve or stash dentry * @mnt: mnt of the filesystem to use * @data: data to store in inode->i_private * @path: path to create * * The function tries to retrieve a stashed dentry from @stashed. If the dentry * is still valid then it will be reused. If the dentry isn't reusable, the function * will allocate a new dentry and inode. It will then check again whether it * can reuse an existing dentry in case one has been added in the meantime or * update @stashed with the newly added dentry. * * Special-purpose helper for nsfs and pidfs. * * Return: Zero on success; a negative errno on failure. */ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { struct dentry *dentry; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ path->dentry = stashed_dentry_get(stashed); if (path->dentry) { sops->put_data(data); goto out_path; } /* Allocate a new dentry. */ dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data); if (IS_ERR(dentry)) return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. */ path->dentry = stash_dentry(stashed, dentry); if (path->dentry != dentry) dput(dentry); out_path: WARN_ON_ONCE(path->dentry->d_fsdata != stashed); WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); path->mnt = mntget(mnt); return 0; } void stashed_dentry_prune(struct dentry *dentry) { struct dentry **stashed = dentry->d_fsdata; struct inode *inode = d_inode(dentry); if (WARN_ON_ONCE(!stashed)) return; if (!inode) return; /* * Only replace our own @dentry as someone else might've * already cleared out @dentry and stashed their own * dentry in there. */ cmpxchg(stashed, dentry, NULL); }
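/*
 * Usage sketch for the stashed-dentry helpers above (all myfs_* names
 * are hypothetical; modelled on the special-purpose users nsfs and
 * pidfs). The superblock keeps a struct stashed_operations in
 * sb->s_fs_info, and init_inode() must store @data in inode->i_private,
 * which path_from_stashed() asserts:
 *
 *	static void myfs_put_data(void *data)
 *	{
 *		kfree(data);	// drop the reference we were handed
 *	}
 *
 *	static int myfs_init_inode(struct inode *inode, void *data)
 *	{
 *		inode->i_private = data;	// now owned by the inode
 *		inode->i_mode |= 0444;
 *		return 0;
 *	}
 *
 *	static const struct stashed_operations myfs_stashed_ops = {
 *		.put_data	= myfs_put_data,
 *		.init_inode	= myfs_init_inode,
 *	};
 */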
/* SPDX-License-Identifier: GPL-2.0 */ /* * Variant of atomic_t specialized for reference counts. * * The interface matches the atomic_t interface (to aid in porting) but only * provides the few functions one should use for reference counting. * * Saturation semantics * ==================== * * refcount_t differs from atomic_t in that the counter saturates at * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the * counter and causing 'spurious' use-after-free issues. In order to avoid the * cost associated with introducing cmpxchg() loops into all of the saturating * operations, we temporarily allow the counter to take on an unchecked value * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow * or overflow has occurred. Although this is racy when multiple threads * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly * equidistant from 0 and INT_MAX we minimise the scope for error: * * INT_MAX REFCOUNT_SATURATED UINT_MAX * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) * +--------------------------------+----------------+----------------+ * <---------- bad value! ----------> * * (in a signed view of the world, the "bad value" range corresponds to * a negative counter value).
* * As an example, consider a refcount_inc() operation that causes the counter * to overflow: * * int old = atomic_fetch_add_relaxed(r); * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) * if (old < 0) * atomic_set(r, REFCOUNT_SATURATED); * * If another thread also performs a refcount_inc() operation between the two * atomic operations, then the count will continue to edge closer to 0. If it * reaches a value of 1 before /any/ of the threads reset it to the saturated * value, then a concurrent refcount_dec_and_test() may erroneously free the * underlying object. * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). * With the current PID limit, if no batched refcounting operations are used and * the attacker can't repeatedly trigger kernel oopses in the middle of refcount * operations, this makes it impossible for a saturated refcount to leave the * saturation range, even if it is possible for multiple uses of the same * refcount to nest in the context of a single task: * * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = * 0x40000000 / 0x400000 = 0x100 = 256 * * If hundreds of references are added/removed with a single refcounting * operation, it may potentially be possible to leave the saturation range; but * given the precise timing details involved with the round-robin scheduling of * each thread manipulating the refcount and the need to hit the race multiple * times in succession, there doesn't appear to be a practical avenue of attack * even if using refcount_add() operations with larger increments. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object we're increasing the * reference count on will provide the ordering. For locked data structures, * its the lock acquire, for RCU/lockless data structures its the dependent * load. * * Do note that inc_not_zero() provides a control dependency which will order * future stores against the inc, this ensures we'll never modify the object * if we did not in fact acquire a reference. * * The decrements will provide release order, such that all the prior loads and * stores will be issued before, it also provides a control dependency, which * will order us against the subsequent free(). * * The control dependency is against the load of the cmpxchg (ll/sc) that * succeeded. This means the stores aren't fully ordered, but this is fine * because the 1->0 transition indicates no concurrency. * * Note that the allocator is responsible for ordering things between free() * and alloc(). * * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide * acquire and release ordering for cases when the memory occupied by the * object might be reused to store another object. This is important for the * cases where secondary validation is required to detect such reuse, e.g. * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after * the refcount is taken, hence acquire order is necessary. 
Similarly, when the * object is initialized, all stores to its attributes should be visible before * the refcount is set, otherwise a stale attribute value might be used by * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H #define _LINUX_REFCOUNT_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/limits.h> #include <linux/refcount_types.h> #include <linux/spinlock_types.h> struct mutex; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) enum refcount_saturation_type { REFCOUNT_ADD_NOT_ZERO_OVF, REFCOUNT_ADD_OVF, REFCOUNT_ADD_UAF, REFCOUNT_SUB_UAF, REFCOUNT_DEC_LEAK, }; void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t); /** * refcount_set - set a refcount's value * @r: the refcount * @n: value to which the refcount will be set */ static inline void refcount_set(refcount_t *r, int n) { atomic_set(&r->refs, n); } /** * refcount_set_release - set a refcount's value with release ordering * @r: the refcount * @n: value to which the refcount will be set * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides release memory ordering which will order previous memory operations * against this store. This ensures all updates to this object are visible * once the refcount is set and stale values from the object previously * occupying this memory are overwritten with new ones. * * This function should be called only after new object is fully initialized. * After this call the object should be considered visible to other tasks even * if it was not yet added into an object collection normally used to discover * it. This is because other tasks might have discovered the object previously * occupying the same memory and after memory reuse they can succeed in taking * refcount to the new object and start using it. */ static inline void refcount_set_release(refcount_t *r, int n) { atomic_set_release(&r->refs, n); } /** * refcount_read - get a refcount's value * @r: the refcount * * Return: the refcount's value */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); } static inline __must_check __signed_wrap bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) { int old = refcount_read(r); do { if (!old) break; } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } /** * refcount_add_not_zero - add a value to a refcount unless it is 0 * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. 
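 *
 * A batched-acquire sketch, pinning @nr_items references in one call
 * (names hypothetical):
 *
 *	if (!refcount_add_not_zero(nr_items, &obj->refs))
 *		return -EAGAIN;	// object is already being torn down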
* * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) { return __refcount_add_not_zero(i, r, NULL); } static inline __must_check __signed_wrap bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, int limit) { int old = refcount_read(r); do { if (!old) break; if (i > limit - old) { if (oldp) *oldp = old; return false; } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) *oldp = old; if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); return old; } static inline __must_check bool __refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) { return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); } static inline __must_check __signed_wrap bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) { return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); } /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * * @i: the value to add to the refcount * @r: the refcount * * Will saturate at REFCOUNT_SATURATED and WARN. * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides acquire memory ordering on success, it is assumed the caller has * guaranteed the object memory to be stable (RCU, etc.). It does provide a * control dependency and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc_not_zero_acquire() should instead be used to increment a * reference count. * * Return: false if the passed refcount is 0, true otherwise */ static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) { return __refcount_add_not_zero_acquire(i, r, NULL); } static inline __signed_wrap void __refcount_add(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_add_relaxed(i, &r->refs); if (oldp) *oldp = old; if (unlikely(!old)) refcount_warn_saturate(r, REFCOUNT_ADD_UAF); else if (unlikely(old < 0 || old + i < 0)) refcount_warn_saturate(r, REFCOUNT_ADD_OVF); } /** * refcount_add - add a value to a refcount * @i: the value to add to the refcount * @r: the refcount * * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_inc(), or one of its variants, should instead be used to * increment a reference count. */ static inline void refcount_add(int i, refcount_t *r) { __refcount_add(i, r, NULL); } static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) { return __refcount_add_not_zero(1, r, oldp); } /** * refcount_inc_not_zero - increment a refcount unless it is 0 * @r: the refcount to increment * * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED * and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). 
It does provide a control dependency * and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) { return __refcount_inc_not_zero(r, NULL); } static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) { return __refcount_add_not_zero_acquire(1, r, oldp); } /** * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 * @r: the refcount to increment * * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on * success. * * This function should be used when memory occupied by the object might be * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. * * Provides acquire memory ordering on success, it is assumed the caller has * guaranteed the object memory to be stable (RCU, etc.). It does provide a * control dependency and thereby orders future stores. See the comment on top. * * Return: true if the increment was successful, false otherwise */ static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) { return __refcount_inc_not_zero_acquire(r, NULL); } static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); } /** * refcount_inc - increment a refcount * @r: the refcount to increment * * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. * * Provides no memory ordering, it is assumed the caller already has a * reference on the object. * * Will WARN if the refcount is 0, as this represents a possible use-after-free * condition. */ static inline void refcount_inc(refcount_t *r) { __refcount_inc(r, NULL); } static inline __must_check __signed_wrap bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(i, &r->refs); if (oldp) *oldp = old; if (old > 0 && old == i) { smp_acquire__after_ctrl_dep(); return true; } if (unlikely(old <= 0 || old - i < 0)) refcount_warn_saturate(r, REFCOUNT_SUB_UAF); return false; } /** * refcount_sub_and_test - subtract from a refcount and test if it is 0 * @i: amount to subtract from the refcount * @r: the refcount * * Similar to atomic_dec_and_test(), but it will WARN, return false and * ultimately leak on underflow and will fail to decrement when saturated * at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these * cases, refcount_dec(), or one of its variants, should instead be used to * decrement a reference count. * * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) { return __refcount_sub_and_test(i, r, NULL); } static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) { return __refcount_sub_and_test(1, r, oldp); } /** * refcount_dec_and_test - decrement a refcount and test if it is 0 * @r: the refcount * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. 
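 *
 * The canonical put-side pattern (sketch):
 *
 *	static void obj_put(struct obj *o)
 *	{
 *		if (refcount_dec_and_test(&o->refs))
 *			kfree(o);
 *	}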
* * Return: true if the resulting refcount is 0, false otherwise */ static inline __must_check bool refcount_dec_and_test(refcount_t *r) { return __refcount_dec_and_test(r, NULL); } static inline void __refcount_dec(refcount_t *r, int *oldp) { int old = atomic_fetch_sub_release(1, &r->refs); if (oldp) *oldp = old; if (unlikely(old <= 1)) refcount_warn_saturate(r, REFCOUNT_DEC_LEAK); } /** * refcount_dec - decrement a refcount * @r: the refcount * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before. */ static inline void refcount_dec(refcount_t *r) { __refcount_dec(r, NULL); } extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) __cond_acquires(lock); #endif /* _LINUX_REFCOUNT_H */
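A minimal usage sketch of the API above (not part of the header; struct foo and the foo_* helpers are hypothetical): refcount_inc_not_zero() guards lookup paths that can race with the final put, and refcount_dec_and_test() pairs its release ordering with the free.

#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
	refcount_t refs;
	/* payload... */
};

static struct foo *foo_alloc(gfp_t gfp)
{
	struct foo *f = kzalloc(sizeof(*f), gfp);

	if (f)
		refcount_set(&f->refs, 1);	/* caller owns the initial reference */
	return f;
}

/* Lookup-side helper: fails if the last reference is already gone. */
static bool foo_get(struct foo *f)
{
	return refcount_inc_not_zero(&f->refs);
}

static void foo_put(struct foo *f)
{
	/* Release ordering before the test, acquire on success, then free. */
	if (refcount_dec_and_test(&f->refs))
		kfree(f);
}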
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_LOCK_H_ #define _ASM_GENERIC_BITOPS_LOCK_H_ #include <linux/atomic.h> #include <linux/compiler.h> #include <asm/barrier.h> /** * arch_test_and_set_bit_lock - Set a bit and return its old value, for lock * @nr: Bit to set * @addr: Address to count from * * This operation is atomic and provides acquire barrier semantics if * the returned value is 0. * It can be used to implement bit locks. */ static __always_inline int arch_test_and_set_bit_lock(unsigned int nr, volatile unsigned long *p) { long old; unsigned long mask = BIT_MASK(nr); p += BIT_WORD(nr); if (READ_ONCE(*p) & mask) return 1; old = raw_atomic_long_fetch_or_acquire(mask, (atomic_long_t *)p); return !!(old & mask); } /** * arch_clear_bit_unlock - Clear a bit in memory, for unlock * @nr: the bit to clear * @addr: the address to start counting from * * This operation is atomic and provides release barrier semantics. */ static __always_inline void arch_clear_bit_unlock(unsigned int nr, volatile unsigned long *p) { p += BIT_WORD(nr); raw_atomic_long_fetch_andnot_release(BIT_MASK(nr), (atomic_long_t *)p); } /** * arch___clear_bit_unlock - Clear a bit in memory, for unlock * @nr: the bit to clear * @addr: the address to start counting from * * A weaker form of clear_bit_unlock() as used by __bit_lock_unlock(). If all * the bits in the word are protected by this lock some archs can use weaker * ops to safely unlock. * * See for example x86's implementation. */ static inline void arch___clear_bit_unlock(unsigned int nr, volatile unsigned long *p) { unsigned long old; p += BIT_WORD(nr); old = READ_ONCE(*p); old &= ~BIT_MASK(nr); raw_atomic_long_set_release((atomic_long_t *)p, old); } #ifndef arch_xor_unlock_is_negative_byte static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *p) { long old; old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p); return !!(old & BIT(7)); } #endif #include <asm-generic/bitops/instrumented-lock.h> #endif /* _ASM_GENERIC_BITOPS_LOCK_H_ */
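The pattern these primitives exist for is a one-bit spinlock. A hedged sketch of such caller code follows (not part of the header; the wrappers without the arch_ prefix come from the instrumented-lock.h include above, and my_flags/my_bit_lock/my_bit_unlock are illustrative names):

#include <linux/bitops.h>
#include <linux/processor.h>

static unsigned long my_flags;	/* bit 0 serves as the lock bit */

static void my_bit_lock(void)
{
	/* Acquire semantics when the old bit was 0, i.e. we got the lock. */
	while (test_and_set_bit_lock(0, &my_flags))
		cpu_relax();
}

static void my_bit_unlock(void)
{
	/* Release semantics: prior stores are visible before the bit clears. */
	clear_bit_unlock(0, &my_flags);
}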
// SPDX-License-Identifier: GPL-2.0 /* * Implementation of the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the MLS label from NetLabel * Copyright (C) Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/errno.h> #include <net/netlabel.h> #include "sidtab.h" #include "mls.h" #include "policydb.h" #include "services.h" /* * Return the length in bytes for the MLS fields of the * security context string representation of `context'.
*/ int mls_compute_context_len(struct policydb *p, struct context *context) { int i, l, len, head, prev; char *nm; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return 0; len = 1; /* for the beginning ":" */ for (l = 0; l < 2; l++) { u32 index_sens = context->range.level[l].sens; len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1)); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (head != prev) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } nm = sym_name(p, SYM_CATS, i); len += strlen(nm) + 1; head = i; } prev = i; } if (prev != head) { nm = sym_name(p, SYM_CATS, prev); len += strlen(nm) + 1; } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else len++; } } return len; } /* * Write the security context string representation of * the MLS fields of `context' into the string `*scontext'. * Update `*scontext' to point to the end of the MLS fields. */ void mls_sid_to_context(struct policydb *p, struct context *context, char **scontext) { char *scontextp, *nm; int i, l, head, prev; struct ebitmap *e; struct ebitmap_node *node; if (!p->mls_enabled) return; scontextp = *scontext; *scontextp = ':'; scontextp++; for (l = 0; l < 2; l++) { strcpy(scontextp, sym_name(p, SYM_LEVELS, context->range.level[l].sens - 1)); scontextp += strlen(scontextp); /* categories */ head = -2; prev = -2; e = &context->range.level[l].cat; ebitmap_for_each_positive_bit(e, node, i) { if (i - prev > 1) { /* one or more negative bits are skipped */ if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (prev < 0) *scontextp++ = ':'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, i); strcpy(scontextp, nm); scontextp += strlen(nm); head = i; } prev = i; } if (prev != head) { if (prev - head > 1) *scontextp++ = '.'; else *scontextp++ = ','; nm = sym_name(p, SYM_CATS, prev); strcpy(scontextp, nm); scontextp += strlen(nm); } if (l == 0) { if (mls_level_eq(&context->range.level[0], &context->range.level[1])) break; else *scontextp++ = '-'; } } *scontext = scontextp; } int mls_level_isvalid(struct policydb *p, struct mls_level *l) { struct level_datum *levdatum; if (!l->sens || l->sens > p->p_levels.nprim) return 0; levdatum = symtab_search(&p->p_levels, sym_name(p, SYM_LEVELS, l->sens - 1)); if (!levdatum) return 0; /* * Return 1 iff all the bits set in l->cat are also set in * levdatum->level->cat and no bit in l->cat is larger than * p->p_cats.nprim. */ return ebitmap_contains(&levdatum->level.cat, &l->cat, p->p_cats.nprim); } int mls_range_isvalid(struct policydb *p, struct mls_range *r) { return (mls_level_isvalid(p, &r->level[0]) && mls_level_isvalid(p, &r->level[1]) && mls_level_dom(&r->level[1], &r->level[0])); } /* * Return 1 if the MLS fields in the security context * structure `c' are valid. Return 0 otherwise. */ int mls_context_isvalid(struct policydb *p, struct context *c) { struct user_datum *usrdatum; if (!p->mls_enabled) return 1; if (!mls_range_isvalid(p, &c->range)) return 0; if (c->role == OBJECT_R_VAL) return 1; /* * User must be authorized for the MLS range.
*/ if (!c->user || c->user > p->p_users.nprim) return 0; usrdatum = p->user_val_to_struct[c->user - 1]; if (!mls_range_contains(usrdatum->range, c->range)) return 0; /* user may not be associated with range */ return 1; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `scontext'. * * This function modifies the string in place, inserting * NULL characters to terminate the MLS fields. * * If a def_sid is provided and no MLS field is present, * copy the MLS field of the associated default context. * Used for systems upgraded to MLS where objects may lack * MLS fields. * * Policy read-lock must be held for sidtab lookup. * */ int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext, struct context *context, struct sidtab *s, u32 def_sid) { char *sensitivity, *cur_cat, *next_cat, *rngptr; struct level_datum *levdatum; struct cat_datum *catdatum, *rngdatum; u32 i; int l, rc; char *rangep[2]; if (!pol->mls_enabled) { /* * With no MLS, only return -EINVAL if there is an MLS field * and it did not come from an xattr. */ if (oldc && def_sid == SECSID_NULL) return -EINVAL; return 0; } /* * No MLS component to the security context, try to map to * default if provided. */ if (!oldc) { struct context *defcon; if (def_sid == SECSID_NULL) return -EINVAL; defcon = sidtab_search(s, def_sid); if (!defcon) return -EINVAL; return mls_context_cpy(context, defcon); } /* * If we're dealing with a range, figure out where the two parts * of the range begin. */ rangep[0] = scontext; rangep[1] = strchr(scontext, '-'); if (rangep[1]) { rangep[1][0] = '\0'; rangep[1]++; } /* For each part of the range: */ for (l = 0; l < 2; l++) { /* Split sensitivity and category set. */ sensitivity = rangep[l]; if (sensitivity == NULL) break; next_cat = strchr(sensitivity, ':'); if (next_cat) *(next_cat++) = '\0'; /* Parse sensitivity. */ levdatum = symtab_search(&pol->p_levels, sensitivity); if (!levdatum) return -EINVAL; context->range.level[l].sens = levdatum->level.sens; /* Extract category set. */ while (next_cat != NULL) { cur_cat = next_cat; next_cat = strchr(next_cat, ','); if (next_cat != NULL) *(next_cat++) = '\0'; /* Separate into range if exists */ rngptr = strchr(cur_cat, '.'); if (rngptr != NULL) { /* Remove '.' */ *rngptr++ = '\0'; } catdatum = symtab_search(&pol->p_cats, cur_cat); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&context->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; /* If range, set all categories in range */ if (rngptr == NULL) continue; rngdatum = symtab_search(&pol->p_cats, rngptr); if (!rngdatum) return -EINVAL; if (catdatum->value >= rngdatum->value) return -EINVAL; for (i = catdatum->value; i < rngdatum->value; i++) { rc = ebitmap_set_bit( &context->range.level[l].cat, i, 1); if (rc) return rc; } } } /* If we didn't see a '-', the range start is also the range end. */ if (rangep[1] == NULL) { context->range.level[1].sens = context->range.level[0].sens; rc = ebitmap_cpy(&context->range.level[1].cat, &context->range.level[0].cat); if (rc) return rc; } return 0; } /* * Set the MLS fields in the security context structure * `context' based on the string representation in * the string `str'. This function will allocate temporary memory with the * given constraints of gfp_mask.
*/ int mls_from_string(struct policydb *p, char *str, struct context *context, gfp_t gfp_mask) { char *tmpstr; int rc; if (!p->mls_enabled) return -EINVAL; tmpstr = kstrdup(str, gfp_mask); if (!tmpstr) { rc = -ENOMEM; } else { rc = mls_context_to_sid(p, ':', tmpstr, context, NULL, SECSID_NULL); kfree(tmpstr); } return rc; } /* * Copies the MLS range `range' into `context'. */ int mls_range_set(struct context *context, struct mls_range *range) { int l, rc = 0; /* Copy the MLS range into the context */ for (l = 0; l < 2; l++) { context->range.level[l].sens = range->level[l].sens; rc = ebitmap_cpy(&context->range.level[l].cat, &range->level[l].cat); if (rc) break; } return rc; } int mls_setup_user_range(struct policydb *p, struct context *fromcon, struct user_datum *user, struct context *usercon) { if (p->mls_enabled) { struct mls_level *fromcon_sen = &(fromcon->range.level[0]); struct mls_level *fromcon_clr = &(fromcon->range.level[1]); struct mls_level *user_low = &(user->range.level[0]); struct mls_level *user_clr = &(user->range.level[1]); struct mls_level *user_def = &(user->dfltlevel); struct mls_level *usercon_sen = &(usercon->range.level[0]); struct mls_level *usercon_clr = &(usercon->range.level[1]); /* Honor the user's default level if we can */ if (mls_level_between(user_def, fromcon_sen, fromcon_clr)) *usercon_sen = *user_def; else if (mls_level_between(fromcon_sen, user_def, user_clr)) *usercon_sen = *fromcon_sen; else if (mls_level_between(fromcon_clr, user_low, user_def)) *usercon_sen = *user_low; else return -EINVAL; /* Lower the clearance of available contexts if the clearance of "fromcon" is lower than that of the user's default clearance (but only if the "fromcon" clearance dominates the user's computed sensitivity level) */ if (mls_level_dom(user_clr, fromcon_clr)) *usercon_clr = *fromcon_clr; else if (mls_level_dom(fromcon_clr, user_clr)) *usercon_clr = *user_clr; else return -EINVAL; } return 0; } /* * Convert the MLS fields in the security context * structure `oldc' from the values specified in the * policy `oldp' to the values specified in the policy `newp', * storing the resulting context in `newc'. */ int mls_convert_context(struct policydb *oldp, struct policydb *newp, struct context *oldc, struct context *newc) { struct level_datum *levdatum; struct cat_datum *catdatum; struct ebitmap_node *node; u32 i; int l; if (!oldp->mls_enabled || !newp->mls_enabled) return 0; for (l = 0; l < 2; l++) { char *name = sym_name(oldp, SYM_LEVELS, oldc->range.level[l].sens - 1); levdatum = symtab_search(&newp->p_levels, name); if (!levdatum) return -EINVAL; newc->range.level[l].sens = levdatum->level.sens; ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node, i) { int rc; catdatum = symtab_search(&newp->p_cats, sym_name(oldp, SYM_CATS, i)); if (!catdatum) return -EINVAL; rc = ebitmap_set_bit(&newc->range.level[l].cat, catdatum->value - 1, 1); if (rc) return rc; } } return 0; } int mls_compute_sid(struct policydb *p, struct context *scontext, struct context *tcontext, u16 tclass, u32 specified, struct context *newcontext, bool sock) { struct range_trans rtr; struct mls_range *r; struct class_datum *cladatum; char default_range = 0; if (!p->mls_enabled) return 0; switch (specified) { case AVTAB_TRANSITION: /* Look for a range transition rule. 
*/ rtr.source_type = scontext->type; rtr.target_type = tcontext->type; rtr.target_class = tclass; r = policydb_rangetr_search(p, &rtr); if (r) return mls_range_set(newcontext, r); if (tclass && tclass <= p->p_classes.nprim) { cladatum = p->class_val_to_struct[tclass - 1]; if (cladatum) default_range = cladatum->default_range; } switch (default_range) { case DEFAULT_SOURCE_LOW: return mls_context_cpy_low(newcontext, scontext); case DEFAULT_SOURCE_HIGH: return mls_context_cpy_high(newcontext, scontext); case DEFAULT_SOURCE_LOW_HIGH: return mls_context_cpy(newcontext, scontext); case DEFAULT_TARGET_LOW: return mls_context_cpy_low(newcontext, tcontext); case DEFAULT_TARGET_HIGH: return mls_context_cpy_high(newcontext, tcontext); case DEFAULT_TARGET_LOW_HIGH: return mls_context_cpy(newcontext, tcontext); case DEFAULT_GLBLUB: return mls_context_glblub(newcontext, scontext, tcontext); } fallthrough; case AVTAB_CHANGE: if ((tclass == p->process_class) || sock) /* Use the process MLS attributes. */ return mls_context_cpy(newcontext, scontext); else /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); case AVTAB_MEMBER: /* Use the process effective MLS attributes. */ return mls_context_cpy_low(newcontext, scontext); } return -EINVAL; } #ifdef CONFIG_NETLABEL /** * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS sensitivity level into the * NetLabel MLS sensitivity level field. * */ void mls_export_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; secattr->attr.mls.lvl = context->range.level[0].sens - 1; secattr->flags |= NETLBL_SECATTR_MLS_LVL; } /** * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context and the NetLabel security attributes, copy the * NetLabel MLS sensitivity level into the context. * */ void mls_import_netlbl_lvl(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { if (!p->mls_enabled) return; context->range.level[0].sens = secattr->attr.mls.lvl + 1; context->range.level[1].sens = context->range.level[0].sens; } /** * mls_export_netlbl_cat - Export the MLS categories to NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Given the security context copy the low MLS categories into the NetLabel * MLS category field. Returns zero on success, negative values on failure. * */ int mls_export_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_export(&context->range.level[0].cat, &secattr->attr.mls.cat); if (rc == 0 && secattr->attr.mls.cat != NULL) secattr->flags |= NETLBL_SECATTR_MLS_CAT; return rc; } /** * mls_import_netlbl_cat - Import the MLS categories from NetLabel * @p: the policy * @context: the security context * @secattr: the NetLabel security attributes * * Description: * Copy the NetLabel security attributes into the SELinux context; since the * NetLabel security attribute only contains a single MLS category use it for * both the low and high categories of the context. Returns zero on success, * negative values on failure. 
* */ int mls_import_netlbl_cat(struct policydb *p, struct context *context, struct netlbl_lsm_secattr *secattr) { int rc; if (!p->mls_enabled) return 0; rc = ebitmap_netlbl_import(&context->range.level[0].cat, secattr->attr.mls.cat); if (rc) goto import_netlbl_cat_failure; memcpy(&context->range.level[1].cat, &context->range.level[0].cat, sizeof(context->range.level[0].cat)); return 0; import_netlbl_cat_failure: ebitmap_destroy(&context->range.level[0].cat); return rc; } #endif /* CONFIG_NETLABEL */
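As a worked illustration of the category compression that mls_compute_context_len() and mls_sid_to_context() implement, the standalone userspace sketch below (plain C, not kernel code; render_cats is a hypothetical name) renders a set of category bits the same way: runs of adjacent categories collapse to "first.last", so bits {0,1,2,3,5} become ":c0.c3,c5".

#include <stdio.h>

/* Render category bits with the same run compression as mls_sid_to_context(). */
static void render_cats(const int *cat, int n)
{
	int head = -2, prev = -2, i;

	for (i = 0; i < n; i++) {
		if (cat[i] - prev > 1) {	/* a gap: close any open run */
			if (prev != head)
				printf("%cc%d", prev - head > 1 ? '.' : ',', prev);
			printf("%cc%d", prev < 0 ? ':' : ',', cat[i]);
			head = cat[i];
		}
		prev = cat[i];
	}
	if (prev != head)	/* close the final run */
		printf("%cc%d", prev - head > 1 ? '.' : ',', prev);
	printf("\n");
}

int main(void)
{
	static const int cats[] = { 0, 1, 2, 3, 5 };

	render_cats(cats, 5);	/* prints ":c0.c3,c5" */
	return 0;
}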
// SPDX-License-Identifier: GPL-2.0-only #include <linux/irqchip/arm-gic-v3.h> #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/kstrtox.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/string_choices.h> #include <kvm/arm_vgic.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/kvm_asm.h> #include "vgic.h" static bool group0_trap; static bool group1_trap; static
bool common_trap; static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; cpuif->vgic_hcr |= ICH_HCR_EL2_UIE; } static bool lr_signals_eoi_mi(u64 lr_val) { return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && !(lr_val & ICH_LR_HW); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE; for (lr = 0; lr < cpuif->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; if (model == KVM_DEV_TYPE_ARM_VGIC_V3) { intid = val & ICH_LR_VIRTUAL_ID_MASK; } else { intid = val & GICH_LR_VIRTUALID; is_v2_sgi = vgic_irq_is_sgi(intid); } /* Notify fds when the guest EOI'ed a level-triggered IRQ */ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) kvm_notify_acked_irq(vcpu->kvm, 0, intid - VGIC_NR_PRIVATE_IRQS); irq = vgic_get_vcpu_irq(vcpu, intid); if (!irq) /* An LPI could have been unmapped. */ continue; raw_spin_lock(&irq->irq_lock); /* Always preserve the active bit, note deactivation */ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) irq->active_source = cpuid; /* Edge is the only case where we preserve the pending bit */ if (irq->config == VGIC_CONFIG_EDGE && (val & ICH_LR_PENDING_BIT)) { irq->pending_latch = true; if (is_v2_sgi) irq->source |= (1 << cpuid); } /* * Clear soft pending state when level irqs have been acked. */ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; /* Handle resampling for mapped interrupts if required */ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } cpuif->used_lrs = 0; } /* Requires the irq to be locked already */ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { u32 model = vcpu->kvm->arch.vgic.vgic_model; u64 val = irq->intid; bool allow_pending = true, is_v2_sgi; is_v2_sgi = (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2); if (irq->active) { val |= ICH_LR_ACTIVE_BIT; if (is_v2_sgi) val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT; if (vgic_irq_is_multi_sgi(irq)) { allow_pending = false; val |= ICH_LR_EOI; } } if (irq->hw && !vgic_irq_needs_resampling(irq)) { val |= ICH_LR_HW; val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT; /* * Never set pending+active on a HW interrupt, as the * pending state is kept at the physical distributor * level. */ if (irq->active) allow_pending = false; } else { if (irq->config == VGIC_CONFIG_LEVEL) { val |= ICH_LR_EOI; /* * Software resampling doesn't work very well * if we allow P+A, so let's not do that. 
*/ if (irq->active) allow_pending = false; } } if (allow_pending && irq_is_pending(irq)) { val |= ICH_LR_PENDING_BIT; if (irq->config == VGIC_CONFIG_EDGE) irq->pending_latch = false; if (vgic_irq_is_sgi(irq->intid) && model == KVM_DEV_TYPE_ARM_VGIC_V2) { u32 src = ffs(irq->source); if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n", irq->intid)) return; val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT; irq->source &= ~(1 << (src - 1)); if (irq->source) { irq->pending_latch = true; val |= ICH_LR_EOI; } } } /* * Level-triggered mapped IRQs are special because we only observe * rising edges as input to the VGIC. We therefore lower the line * level here, so that we can take new virtual IRQs. See * vgic_v3_fold_lr_state for more info. */ if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) irq->line_level = false; if (irq->group) val |= ICH_LR_GROUP; val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT; vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; } void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) { vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0; } void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; u32 vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) & ICH_VMCR_ACK_CTL_MASK; vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) & ICH_VMCR_FIQ_EN_MASK; } else { /* * When emulating GICv3 on GICv3 with SRE=1, the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ vmcr = ICH_VMCR_FIQ_EN_MASK; } vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK; vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK; vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; cpu_if->vgic_vmcr = vmcr; } void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; u32 vmcr; vmcr = cpu_if->vgic_vmcr; if (model == KVM_DEV_TYPE_ARM_VGIC_V2) { vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >> ICH_VMCR_ACK_CTL_SHIFT; vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >> ICH_VMCR_FIQ_EN_SHIFT; } else { /* * When emulating GICv3 on GICv3 with SRE=1, the * VFIQEn bit is RES1 and the VAckCtl bit is RES0. */ vmcrp->fiqen = 1; vmcrp->ackctl = 0; } vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT; vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT; vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT; vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT; } #define INITIAL_PENDBASER_VALUE \ (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) void vgic_v3_enable(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; /* * By forcing VMCR to zero, the GIC will restore the binary * points to their reset values.
Anything else resets to zero * anyway. */ vgic_v3->vgic_vmcr = 0; /* * If we are emulating a GICv3, we do it in a non-GICv2-compatible * way, so we force SRE to 1 to demonstrate this to the guest. * Also, we don't support any form of IRQ/FIQ bypass. * This goes with the spec allowing the value to be RAO/WI. */ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE); vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; } else { vgic_v3->vgic_sre = 0; } vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits, kvm_vgic_global_state.ich_vtr_el2); vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits, kvm_vgic_global_state.ich_vtr_el2) + 1; /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EL2_En; } void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; /* Hide GICv3 sysreg if necessary */ if (!kvm_has_gicv3(vcpu->kvm)) { vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 | ICH_HCR_EL2_TC); return; } if (group0_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0; if (group1_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC; if (dir_trap) vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) { struct kvm_vcpu *vcpu; int byte_offset, bit_nr; gpa_t pendbase, ptr; bool status; u8 val; int ret; unsigned long flags; retry: vcpu = irq->target_vcpu; if (!vcpu) return 0; pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); byte_offset = irq->intid / BITS_PER_BYTE; bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; status = val & (1 << bit_nr); raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->target_vcpu != vcpu) { raw_spin_unlock_irqrestore(&irq->irq_lock, flags); goto retry; } irq->pending_latch = status; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } return 0; } /* * The deactivation of the doorbell interrupt will trigger the * unmapping of the associated vPE. */ static void unmap_all_vpes(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; int i; for (i = 0; i < dist->its_vm.nr_vpes; i++) free_irq(dist->its_vm.vpes[i]->irq, kvm_get_vcpu(kvm, i)); } static void map_all_vpes(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; int i; for (i = 0; i < dist->its_vm.nr_vpes; i++) WARN_ON(vgic_v4_request_vpe_irq(kvm_get_vcpu(kvm, i), dist->its_vm.vpes[i]->irq)); } /* * vgic_v3_save_pending_tables - Save the pending tables into guest RAM * kvm lock and all vcpu locks must be held */ int vgic_v3_save_pending_tables(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct vgic_irq *irq; gpa_t last_ptr = ~(gpa_t)0; bool vlpi_avail = false; unsigned long index; int ret = 0; u8 val; if (unlikely(!vgic_initialized(kvm))) return -ENXIO; /* * A preparation for getting any VLPI states. * The above vgic initialized check also ensures that the allocation * and enabling of the doorbells have already been done.
*/ if (kvm_vgic_global_state.has_gicv4_1) { unmap_all_vpes(kvm); vlpi_avail = true; } xa_for_each(&dist->lpi_xa, index, irq) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; bool is_pending; bool stored; vcpu = irq->target_vcpu; if (!vcpu) continue; pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); byte_offset = irq->intid / BITS_PER_BYTE; bit_nr = irq->intid % BITS_PER_BYTE; ptr = pendbase + byte_offset; if (ptr != last_ptr) { ret = kvm_read_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; last_ptr = ptr; } stored = val & (1U << bit_nr); is_pending = irq->pending_latch; if (irq->hw && vlpi_avail) vgic_v4_get_vlpi_state(irq, &is_pending); if (stored == is_pending) continue; if (is_pending) val |= 1 << bit_nr; else val &= ~(1 << bit_nr); ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } out: if (vlpi_avail) map_all_vpes(kvm); return ret; } /** * vgic_v3_rdist_overlap - check if a region overlaps with any * existing redistributor region * * @kvm: kvm handle * @base: base of the region * @size: size of region * * Return: true if there is an overlap */ bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, &d->rd_regions, list) { if ((base + size > rdreg->base) && (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg))) return true; } return false; } /* * Check for overlapping regions and for regions crossing the end of memory * for base addresses which have already been set. */ bool vgic_v3_check_base(struct kvm *kvm) { struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) && d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base) return false; list_for_each_entry(rdreg, &d->rd_regions, list) { size_t sz = vgic_v3_rd_region_size(kvm, rdreg); if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, rdreg->base, SZ_64K, sz)) return false; } if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base)) return true; return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base, KVM_VGIC_V3_DIST_SIZE); } /** * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one * which has free space to put a new rdist region. * * @rd_regions: redistributor region list head * * A redistributor region maps n redistributors, n = region size / (2 x 64kB). * Stride between redistributors is 0 and regions are filled in index order. * * Return: the redist region handle, if any, that has space to map a new rdist * region.
*/ struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions) { struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, rd_regions, list) { if (!vgic_v3_redist_region_full(rdreg)) return rdreg; } return NULL; } struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, u32 index) { struct list_head *rd_regions = &kvm->arch.vgic.rd_regions; struct vgic_redist_region *rdreg; list_for_each_entry(rdreg, rd_regions, list) { if (rdreg->index == index) return rdreg; } return NULL; } int vgic_v3_map_resources(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; unsigned long c; kvm_for_each_vcpu(c, vcpu, kvm) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { kvm_debug("vcpu %ld redistributor base not set\n", c); return -ENXIO; } } if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { kvm_debug("Need to set vgic distributor addresses first\n"); return -ENXIO; } if (!vgic_v3_check_base(kvm)) { kvm_debug("VGIC redist and dist frames overlap\n"); return -EINVAL; } /* * For a VGICv3 we require the userland to explicitly initialize * the VGIC before we need to use it. */ if (!vgic_initialized(kvm)) { return -EBUSY; } if (kvm_vgic_global_state.has_gicv4_1) vgic_v4_configure_vsgis(kvm); return 0; } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); static int __init early_group0_trap_cfg(char *buf) { return kstrtobool(buf, &group0_trap); } early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg); static int __init early_group1_trap_cfg(char *buf) { return kstrtobool(buf, &group1_trap); } early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg); static int __init early_common_trap_cfg(char *buf) { return kstrtobool(buf, &common_trap); } early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); static int __init early_gicv4_enable(char *buf) { return kstrtobool(buf, &gicv4_enable); } early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {}, }; static bool vgic_v3_broken_seis(void) { return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) && is_midr_in_range_list(broken_seis)); } /** * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller * @info: pointer to the GIC description * * Returns 0 if the VGICv3 has been probed successfully, returns an error code * otherwise */ int vgic_v3_probe(const struct gic_kvm_info *info) { u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config); bool has_v2; int ret; has_v2 = ich_vtr_el2 >> 63; ich_vtr_el2 = (u32)ich_vtr_el2; /* * The ListRegs field is 5 bits, but there is an architectural * maximum of 16 list registers. Just ignore bit 4... */ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1; kvm_vgic_global_state.can_emulate_gicv2 = false; kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; /* GICv4 support? 
*/ if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); } else if (!has_v2) { pr_warn(FW_BUG "CPU interface incapable of MMIO access\n"); } else if (!PAGE_ALIGNED(info->vcpu.start)) { pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); } else if (kvm_get_mode() != KVM_MODE_PROTECTED) { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); if (ret) { kvm_err("Cannot register GICv2 KVM device.\n"); return ret; } kvm_info("vgic-v2@%llx\n", info->vcpu.start); } ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); if (ret) { kvm_err("Cannot register GICv3 KVM device.\n"); kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); return ret; } if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) { group0_trap = true; group1_trap = true; } if (vgic_v3_broken_seis()) { kvm_info("GICv3 with broken locally generated SEI\n"); kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS; group0_trap = true; group1_trap = true; if (ich_vtr_el2 & ICH_VTR_EL2_TDS) dir_trap = true; else common_trap = true; } if (group0_trap || group1_trap || common_trap || dir_trap) { kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", common_trap ? "C" : "", dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } kvm_vgic_global_state.vctrl_base = NULL; kvm_vgic_global_state.type = VGIC_V3; kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS; return 0; } void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; /* If the vgic is nested, perform the full state loading */ if (vgic_state_is_nested(vcpu)) { vgic_v3_load_nested(vcpu); return; } if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); WARN_ON(vgic_v4_load(vcpu)); } void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (vgic_state_is_nested(vcpu)) { vgic_v3_put_nested(vcpu); return; } if (likely(!is_protected_kvm_enabled())) kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) __vgic_v3_deactivate_traps(cpu_if); }
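Both vgic_v3_lpi_sync_pending_status() and vgic_v3_save_pending_tables() locate an LPI's pending bit in guest RAM with the same byte/bit arithmetic below GICR_PENDBASER. A standalone sketch of that addressing (plain C, illustrative only; the INTID and base address are made-up values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int intid = 8197;		/* example LPI INTID */
	uint64_t pendbase = 0x80000000ULL;	/* hypothetical GICR_PENDBASER address */
	uint64_t ptr = pendbase + intid / 8;	/* byte holding this LPI's bit */
	unsigned int bit = intid % 8;		/* bit within that byte */

	/* INTID 8197 -> byte offset 1024, bit 5 */
	printf("INTID %u -> byte 0x%llx, bit %u\n",
	       intid, (unsigned long long)ptr, bit);
	return 0;
}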
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the security services.
 *
 * Authors: Stephen Smalley, <stephen.smalley.work@gmail.com>
 *          James Morris <jmorris@redhat.com>
 *
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *	Support for enhanced MLS infrastructure.
 *	Support for context based audit filters.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *	Added conditional policy language extensions
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *	Added support for NetLabel
 *	Added support for the policy capability bitmap
 *
 * Updated: Chad Sellers <csellers@tresys.com>
 *
 *	Added validation of kernel classes and permissions
 *
 * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 *	Added support for bounds domain and audit messages on masked permissions
 *
 * Updated: Guido Trentalancia <guido@trentalancia.com>
 *
 *	Added support for runtime switching of the policy type
 *
 * Copyright (C) 2008, 2009 NEC Corporation
 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
* Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/sched.h> #include <linux/audit.h> #include <linux/parser.h> #include <linux/vmalloc.h> #include <linux/lsm_hooks.h> #include <net/netlabel.h> #include "flask.h" #include "avc.h" #include "avc_ss.h" #include "security.h" #include "context.h" #include "policydb.h" #include "sidtab.h" #include "services.h" #include "conditional.h" #include "mls.h" #include "objsec.h" #include "netlabel.h" #include "xfrm.h" #include "ebitmap.h" #include "audit.h" #include "policycap_names.h" #include "ima.h" struct selinux_policy_convert_data { struct convert_context_args args; struct sidtab_convert_params sidtab_params; }; /* Forward declaration. */ static int context_struct_to_string(struct policydb *policydb, struct context *context, char **scontext, u32 *scontext_len); static int sidtab_entry_to_string(struct policydb *policydb, struct sidtab *sidtab, struct sidtab_entry *entry, char **scontext, u32 *scontext_len); static void context_struct_compute_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd, struct extended_perms *xperms); static int selinux_set_mapping(struct policydb *pol, const struct security_class_mapping *map, struct selinux_map *out_map) { u16 i, j; bool print_unknown_handle = false; /* Find number of classes in the input mapping */ if (!map) return -EINVAL; i = 0; while (map[i].name) i++; /* Allocate space for the class records, plus one for class zero */ out_map->mapping = kcalloc(++i, sizeof(*out_map->mapping), GFP_ATOMIC); if (!out_map->mapping) return -ENOMEM; /* Store the raw class and permission values */ j = 0; while (map[j].name) { const struct security_class_mapping *p_in = map + (j++); struct selinux_mapping *p_out = out_map->mapping + j; u16 k; /* An empty class string skips ahead */ if (!strcmp(p_in->name, "")) { p_out->num_perms = 0; continue; } p_out->value = string_to_security_class(pol, p_in->name); if (!p_out->value) { pr_info("SELinux: Class %s not defined in policy.\n", p_in->name); if (pol->reject_unknown) goto err; p_out->num_perms = 0; print_unknown_handle = true; continue; } k = 0; while (p_in->perms[k]) { /* An empty permission string skips ahead */ if (!*p_in->perms[k]) { k++; continue; } p_out->perms[k] = string_to_av_perm(pol, p_out->value, p_in->perms[k]); if (!p_out->perms[k]) { pr_info("SELinux: Permission %s in class %s not defined in policy.\n", p_in->perms[k], p_in->name); if (pol->reject_unknown) goto err; print_unknown_handle = true; } k++; } p_out->num_perms = k; } if (print_unknown_handle) pr_info("SELinux: the above unknown classes and permissions will be %s\n", pol->allow_unknown ? 
"allowed" : "denied"); out_map->size = i; return 0; err: kfree(out_map->mapping); out_map->mapping = NULL; return -EINVAL; } /* * Get real, policy values from mapped values */ static u16 unmap_class(struct selinux_map *map, u16 tclass) { if (tclass < map->size) return map->mapping[tclass].value; return tclass; } /* * Get kernel value for class from its policy value */ static u16 map_class(struct selinux_map *map, u16 pol_value) { u16 i; for (i = 1; i < map->size; i++) { if (map->mapping[i].value == pol_value) return i; } return SECCLASS_NULL; } static void map_decision(struct selinux_map *map, u16 tclass, struct av_decision *avd, int allow_unknown) { if (tclass < map->size) { struct selinux_mapping *mapping = &map->mapping[tclass]; unsigned int i, n = mapping->num_perms; u32 result; for (i = 0, result = 0; i < n; i++) { if (avd->allowed & mapping->perms[i]) result |= (u32)1<<i; if (allow_unknown && !mapping->perms[i]) result |= (u32)1<<i; } avd->allowed = result; for (i = 0, result = 0; i < n; i++) if (avd->auditallow & mapping->perms[i]) result |= (u32)1<<i; avd->auditallow = result; for (i = 0, result = 0; i < n; i++) { if (avd->auditdeny & mapping->perms[i]) result |= (u32)1<<i; if (!allow_unknown && !mapping->perms[i]) result |= (u32)1<<i; } /* * In case the kernel has a bug and requests a permission * between num_perms and the maximum permission number, we * should audit that denial */ for (; i < (sizeof(u32)*8); i++) result |= (u32)1<<i; avd->auditdeny = result; } } int security_mls_enabled(void) { int mls_enabled; struct selinux_policy *policy; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); mls_enabled = policy->policydb.mls_enabled; rcu_read_unlock(); return mls_enabled; } /* * Return the boolean value of a constraint expression * when it is applied to the specified source and target * security contexts. * * xcontext is a special beast... It is used by the validatetrans rules * only. For these rules, scontext is the context before the transition, * tcontext is the context after the transition, and xcontext is the context * of the process performing the transition. All other callers of * constraint_expr_eval should pass in NULL for xcontext. 
*/ static int constraint_expr_eval(struct policydb *policydb, struct context *scontext, struct context *tcontext, struct context *xcontext, struct constraint_expr *cexpr) { u32 val1, val2; struct context *c; struct role_datum *r1, *r2; struct mls_level *l1, *l2; struct constraint_expr *e; int s[CEXPR_MAXDEPTH]; int sp = -1; for (e = cexpr; e; e = e->next) { switch (e->expr_type) { case CEXPR_NOT: BUG_ON(sp < 0); s[sp] = !s[sp]; break; case CEXPR_AND: BUG_ON(sp < 1); sp--; s[sp] &= s[sp + 1]; break; case CEXPR_OR: BUG_ON(sp < 1); sp--; s[sp] |= s[sp + 1]; break; case CEXPR_ATTR: if (sp == (CEXPR_MAXDEPTH - 1)) return 0; switch (e->attr) { case CEXPR_USER: val1 = scontext->user; val2 = tcontext->user; break; case CEXPR_TYPE: val1 = scontext->type; val2 = tcontext->type; break; case CEXPR_ROLE: val1 = scontext->role; val2 = tcontext->role; r1 = policydb->role_val_to_struct[val1 - 1]; r2 = policydb->role_val_to_struct[val2 - 1]; switch (e->op) { case CEXPR_DOM: s[++sp] = ebitmap_get_bit(&r1->dominates, val2 - 1); continue; case CEXPR_DOMBY: s[++sp] = ebitmap_get_bit(&r2->dominates, val1 - 1); continue; case CEXPR_INCOMP: s[++sp] = (!ebitmap_get_bit(&r1->dominates, val2 - 1) && !ebitmap_get_bit(&r2->dominates, val1 - 1)); continue; default: break; } break; case CEXPR_L1L2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_L1H2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_H1L2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_H1H2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_L1H1: l1 = &(scontext->range.level[0]); l2 = &(scontext->range.level[1]); goto mls_ops; case CEXPR_L2H2: l1 = &(tcontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; mls_ops: switch (e->op) { case CEXPR_EQ: s[++sp] = mls_level_eq(l1, l2); continue; case CEXPR_NEQ: s[++sp] = !mls_level_eq(l1, l2); continue; case CEXPR_DOM: s[++sp] = mls_level_dom(l1, l2); continue; case CEXPR_DOMBY: s[++sp] = mls_level_dom(l2, l1); continue; case CEXPR_INCOMP: s[++sp] = mls_level_incomp(l2, l1); continue; default: BUG(); return 0; } break; default: BUG(); return 0; } switch (e->op) { case CEXPR_EQ: s[++sp] = (val1 == val2); break; case CEXPR_NEQ: s[++sp] = (val1 != val2); break; default: BUG(); return 0; } break; case CEXPR_NAMES: if (sp == (CEXPR_MAXDEPTH-1)) return 0; c = scontext; if (e->attr & CEXPR_TARGET) c = tcontext; else if (e->attr & CEXPR_XTARGET) { c = xcontext; if (!c) { BUG(); return 0; } } if (e->attr & CEXPR_USER) val1 = c->user; else if (e->attr & CEXPR_ROLE) val1 = c->role; else if (e->attr & CEXPR_TYPE) val1 = c->type; else { BUG(); return 0; } switch (e->op) { case CEXPR_EQ: s[++sp] = ebitmap_get_bit(&e->names, val1 - 1); break; case CEXPR_NEQ: s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1); break; default: BUG(); return 0; } break; default: BUG(); return 0; } } BUG_ON(sp != 0); return s[0]; } /* * security_dump_masked_av - dumps masked permissions during * security_compute_av due to RBAC, MLS/Constraint and Type bounds. 
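 *
 * The output is an AUDIT_SELINUX_ERR record; field names are taken from
 * the audit_log_format() call in the function body, and the values shown
 * here are only illustrative:
 *
 *   op=security_compute_av reason=bounds scontext=... tcontext=...
 *   tclass=... perms=read,write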
*/ static int dump_masked_av_helper(void *k, void *d, void *args) { struct perm_datum *pdatum = d; char **permission_names = args; BUG_ON(pdatum->value < 1 || pdatum->value > 32); permission_names[pdatum->value - 1] = (char *)k; return 0; } static void security_dump_masked_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, u32 permissions, const char *reason) { struct common_datum *common_dat; struct class_datum *tclass_dat; struct audit_buffer *ab; char *tclass_name; char *scontext_name = NULL; char *tcontext_name = NULL; char *permission_names[32]; int index; u32 length; bool need_comma = false; if (!permissions) return; tclass_name = sym_name(policydb, SYM_CLASSES, tclass - 1); tclass_dat = policydb->class_val_to_struct[tclass - 1]; common_dat = tclass_dat->comdatum; /* init permission_names */ if (common_dat && hashtab_map(&common_dat->permissions.table, dump_masked_av_helper, permission_names) < 0) goto out; if (hashtab_map(&tclass_dat->permissions.table, dump_masked_av_helper, permission_names) < 0) goto out; /* get scontext/tcontext in text form */ if (context_struct_to_string(policydb, scontext, &scontext_name, &length) < 0) goto out; if (context_struct_to_string(policydb, tcontext, &tcontext_name, &length) < 0) goto out; /* audit a message */ ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) goto out; audit_log_format(ab, "op=security_compute_av reason=%s " "scontext=%s tcontext=%s tclass=%s perms=", reason, scontext_name, tcontext_name, tclass_name); for (index = 0; index < 32; index++) { u32 mask = (1 << index); if ((mask & permissions) == 0) continue; audit_log_format(ab, "%s%s", need_comma ? "," : "", permission_names[index] ? permission_names[index] : "????"); need_comma = true; } audit_log_end(ab); out: /* release scontext/tcontext */ kfree(tcontext_name); kfree(scontext_name); } /* * security_boundary_permission - drops violated permissions * on boundary constraint. */ static void type_attribute_bounds_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd) { struct context lo_scontext; struct context lo_tcontext, *tcontextp = tcontext; struct av_decision lo_avd; struct type_datum *source; struct type_datum *target; u32 masked = 0; source = policydb->type_val_to_struct[scontext->type - 1]; BUG_ON(!source); if (!source->bounds) return; target = policydb->type_val_to_struct[tcontext->type - 1]; BUG_ON(!target); memset(&lo_avd, 0, sizeof(lo_avd)); memcpy(&lo_scontext, scontext, sizeof(lo_scontext)); lo_scontext.type = source->bounds; if (target->bounds) { memcpy(&lo_tcontext, tcontext, sizeof(lo_tcontext)); lo_tcontext.type = target->bounds; tcontextp = &lo_tcontext; } context_struct_compute_av(policydb, &lo_scontext, tcontextp, tclass, &lo_avd, NULL); masked = ~lo_avd.allowed & avd->allowed; if (likely(!masked)) return; /* no masked permission */ /* mask violated permissions */ avd->allowed &= ~masked; /* audit masked permissions */ security_dump_masked_av(policydb, scontext, tcontext, tclass, masked, "bounds"); } /* * Flag which drivers have permissions and which base permissions are covered. 
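 *
 * Background note (not part of the original comment): for ioctl extended
 * permissions the 16-bit ioctl command is split into an 8-bit "driver"
 * (upper byte) and an 8-bit "function" (lower byte); e.g. command 0x8910
 * has driver 0x89 and function 0x10, so the drivers bitmap below is
 * indexed by the upper byte of the command.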
*/ void services_compute_xperms_drivers( struct extended_perms *xperms, struct avtab_node *node) { unsigned int i; switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLDRIVER: xperms->base_perms |= AVC_EXT_IOCTL; /* if one or more driver has all permissions allowed */ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; break; case AVTAB_XPERMS_IOCTLFUNCTION: xperms->base_perms |= AVC_EXT_IOCTL; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); break; case AVTAB_XPERMS_NLMSG: xperms->base_perms |= AVC_EXT_NLMSG; /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); break; } xperms->len = 1; } /* * Compute access vectors and extended permissions based on a context * structure pair for the permissions in a particular class. */ static void context_struct_compute_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd, struct extended_perms *xperms) { struct constraint_node *constraint; struct role_allow *ra; struct avtab_key avkey; struct avtab_node *node; struct class_datum *tclass_datum; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (xperms) { memset(xperms, 0, sizeof(*xperms)); } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { pr_warn_ratelimited("SELinux: Invalid class %u\n", tclass); return; } tclass_datum = policydb->class_val_to_struct[tclass - 1]; /* * If a specific type enforcement rule was defined for * this permission check, then use it. */ avkey.target_class = tclass; avkey.specified = AVTAB_AV | AVTAB_XPERMS; sattr = &policydb->type_attr_map_array[scontext->type - 1]; tattr = &policydb->type_attr_map_array[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb->te_avtab, &avkey); node; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) avd->allowed |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITALLOW) avd->auditallow |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITDENY) avd->auditdeny &= node->datum.u.data; else if (xperms && (node->key.specified & AVTAB_XPERMS)) services_compute_xperms_drivers(xperms, node); } /* Check conditional av table for additional permissions */ cond_compute_av(&policydb->te_cond_avtab, &avkey, avd, xperms); } } /* * Remove any permissions prohibited by a constraint (this includes * the MLS policy). */ constraint = tclass_datum->constraints; while (constraint) { if ((constraint->permissions & (avd->allowed)) && !constraint_expr_eval(policydb, scontext, tcontext, NULL, constraint->expr)) { avd->allowed &= ~(constraint->permissions); } constraint = constraint->next; } /* * If checking process transition permission and the * role is changing, then check the (current_role, new_role) * pair. 
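 *
 * For illustration (policy-language sketch, not from this file): the
 * (current_role, new_role) pairs come from role-allow rules such as
 *
 *	allow staff_r sysadm_r;
 *
 * and without a matching pair the process transition permissions are
 * stripped from avd->allowed below.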
*/ if (tclass == policydb->process_class && (avd->allowed & policydb->process_trans_perms) && scontext->role != tcontext->role) { for (ra = policydb->role_allow; ra; ra = ra->next) { if (scontext->role == ra->role && tcontext->role == ra->new_role) break; } if (!ra) avd->allowed &= ~policydb->process_trans_perms; } /* * If the given source and target types have boundary * constraint, lazy checks have to mask any violated * permission and notice it to userspace via audit. */ type_attribute_bounds_av(policydb, scontext, tcontext, tclass, avd); } static int security_validtrans_handle_fail(struct selinux_policy *policy, struct sidtab_entry *oentry, struct sidtab_entry *nentry, struct sidtab_entry *tentry, u16 tclass) { struct policydb *p = &policy->policydb; struct sidtab *sidtab = policy->sidtab; char *o = NULL, *n = NULL, *t = NULL; u32 olen, nlen, tlen; if (sidtab_entry_to_string(p, sidtab, oentry, &o, &olen)) goto out; if (sidtab_entry_to_string(p, sidtab, nentry, &n, &nlen)) goto out; if (sidtab_entry_to_string(p, sidtab, tentry, &t, &tlen)) goto out; audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "op=security_validate_transition seresult=denied" " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s", o, n, t, sym_name(p, SYM_CLASSES, tclass-1)); out: kfree(o); kfree(n); kfree(t); if (!enforcing_enabled()) return 0; return -EPERM; } static int security_compute_validatetrans(u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass, bool user) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *oentry; struct sidtab_entry *nentry; struct sidtab_entry *tentry; struct class_datum *tclass_datum; struct constraint_node *constraint; u16 tclass; int rc = 0; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (!user) tclass = unmap_class(&policy->map, orig_tclass); else tclass = orig_tclass; if (!tclass || tclass > policydb->p_classes.nprim) { rc = -EINVAL; goto out; } tclass_datum = policydb->class_val_to_struct[tclass - 1]; oentry = sidtab_search_entry(sidtab, oldsid); if (!oentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, oldsid); rc = -EINVAL; goto out; } nentry = sidtab_search_entry(sidtab, newsid); if (!nentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, newsid); rc = -EINVAL; goto out; } tentry = sidtab_search_entry(sidtab, tasksid); if (!tentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tasksid); rc = -EINVAL; goto out; } constraint = tclass_datum->validatetrans; while (constraint) { if (!constraint_expr_eval(policydb, &oentry->context, &nentry->context, &tentry->context, constraint->expr)) { if (user) rc = -EPERM; else rc = security_validtrans_handle_fail(policy, oentry, nentry, tentry, tclass); goto out; } constraint = constraint->next; } out: rcu_read_unlock(); return rc; } int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid, u16 tclass) { return security_compute_validatetrans(oldsid, newsid, tasksid, tclass, true); } int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass) { return security_compute_validatetrans(oldsid, newsid, tasksid, orig_tclass, false); } /* * security_bounded_transition - check whether the given * transition is directed to bounded, or not. * It returns 0, if @newsid is bounded by @oldsid. * Otherwise, it returns error code. 
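 *
 * For illustration (policy-language sketch, not from this file): given
 *
 *	typebounds httpd_t httpd_child_t;
 *
 * a transition from httpd_t to httpd_child_t is bounded and yields 0,
 * while a transition to an unrelated type yields -EPERM.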
* * @oldsid : current security identifier * @newsid : destinated security identifier */ int security_bounded_transition(u32 old_sid, u32 new_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *old_entry, *new_entry; struct type_datum *type; u32 index; int rc; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = -EINVAL; old_entry = sidtab_search_entry(sidtab, old_sid); if (!old_entry) { pr_err("SELinux: %s: unrecognized SID %u\n", __func__, old_sid); goto out; } rc = -EINVAL; new_entry = sidtab_search_entry(sidtab, new_sid); if (!new_entry) { pr_err("SELinux: %s: unrecognized SID %u\n", __func__, new_sid); goto out; } rc = 0; /* type/domain unchanged */ if (old_entry->context.type == new_entry->context.type) goto out; index = new_entry->context.type; while (true) { type = policydb->type_val_to_struct[index - 1]; BUG_ON(!type); /* not bounded anymore */ rc = -EPERM; if (!type->bounds) break; /* @newsid is bounded by @oldsid */ rc = 0; if (type->bounds == old_entry->context.type) break; index = type->bounds; } if (rc) { char *old_name = NULL; char *new_name = NULL; u32 length; if (!sidtab_entry_to_string(policydb, sidtab, old_entry, &old_name, &length) && !sidtab_entry_to_string(policydb, sidtab, new_entry, &new_name, &length)) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "op=security_bounded_transition " "seresult=denied " "oldcontext=%s newcontext=%s", old_name, new_name); } kfree(new_name); kfree(old_name); } out: rcu_read_unlock(); return rc; } static void avd_init(struct selinux_policy *policy, struct av_decision *avd) { avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (policy) avd->seqno = policy->latest_granting; else avd->seqno = 0; avd->flags = 0; } static void update_xperms_extended_data(u8 specified, const struct extended_perms_data *from, struct extended_perms_data *xp_data) { unsigned int i; switch (specified) { case AVTAB_XPERMS_IOCTLDRIVER: memset(xp_data->p, 0xff, sizeof(xp_data->p)); break; case AVTAB_XPERMS_IOCTLFUNCTION: case AVTAB_XPERMS_NLMSG: for (i = 0; i < ARRAY_SIZE(xp_data->p); i++) xp_data->p[i] |= from->p[i]; break; } } void services_compute_xperms_decision(struct extended_perms_decision *xpermd, struct avtab_node *node) { u16 specified; switch (node->datum.u.xperms->specified) { case AVTAB_XPERMS_IOCTLFUNCTION: if (xpermd->base_perm != AVC_EXT_IOCTL || xpermd->driver != node->datum.u.xperms->driver) return; break; case AVTAB_XPERMS_IOCTLDRIVER: if (xpermd->base_perm != AVC_EXT_IOCTL || !security_xperm_test(node->datum.u.xperms->perms.p, xpermd->driver)) return; break; case AVTAB_XPERMS_NLMSG: if (xpermd->base_perm != AVC_EXT_NLMSG || xpermd->driver != node->datum.u.xperms->driver) return; break; default: pr_warn_once( "SELinux: unknown extended permission (%u) will be ignored\n", node->datum.u.xperms->specified); return; } specified = node->key.specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD); if (specified == AVTAB_XPERMS_ALLOWED) { xpermd->used |= XPERMS_ALLOWED; update_xperms_extended_data(node->datum.u.xperms->specified, &node->datum.u.xperms->perms, xpermd->allowed); } else if (specified == AVTAB_XPERMS_AUDITALLOW) { xpermd->used |= XPERMS_AUDITALLOW; update_xperms_extended_data(node->datum.u.xperms->specified, &node->datum.u.xperms->perms, xpermd->auditallow); } else if (specified == AVTAB_XPERMS_DONTAUDIT) { xpermd->used |= XPERMS_DONTAUDIT; 
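		/* the dontaudit mask accumulates just like allowed/auditallow above */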
update_xperms_extended_data(node->datum.u.xperms->specified, &node->datum.u.xperms->perms, xpermd->dontaudit); } else { pr_warn_once("SELinux: unknown specified key (%u)\n", node->key.specified); } } void security_compute_xperms_decision(u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, u8 base_perm, struct extended_perms_decision *xpermd) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext, *tcontext; struct avtab_key avkey; struct avtab_node *node; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; xpermd->base_perm = base_perm; xpermd->driver = driver; xpermd->used = 0; memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p)); memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); rcu_read_lock(); if (!selinux_initialized()) goto allow; policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; goto out; } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { pr_warn_ratelimited("SELinux: Invalid class %hu\n", tclass); goto out; } avkey.target_class = tclass; avkey.specified = AVTAB_XPERMS; sattr = &policydb->type_attr_map_array[scontext->type - 1]; tattr = &policydb->type_attr_map_array[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb->te_avtab, &avkey); node; node = avtab_search_node_next(node, avkey.specified)) services_compute_xperms_decision(xpermd, node); cond_compute_xperms(&policydb->te_cond_avtab, &avkey, xpermd); } } out: rcu_read_unlock(); return; allow: memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); goto out; } /** * security_compute_av - Compute access vector decisions. * @ssid: source security identifier * @tsid: target security identifier * @orig_tclass: target security class * @avd: access vector decisions * @xperms: extended permissions * * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. */ void security_compute_av(u32 ssid, u32 tsid, u16 orig_tclass, struct av_decision *avd, struct extended_perms *xperms) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); avd_init(policy, avd); xperms->len = 0; if (!selinux_initialized()) goto allow; policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } /* permissive domain? 
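 * A source domain marked permissive in policy (e.g. via a
 * "permissive httpd_t;" statement, an illustrative sketch) gets
 * AVD_FLAGS_PERMISSIVE set here, so denials are audited but not
 * enforced.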
*/ if (ebitmap_get_bit(&policydb->permissive_map, scontext->type)) avd->flags |= AVD_FLAGS_PERMISSIVE; tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; goto out; } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, xperms); map_decision(&policy->map, orig_tclass, avd, policydb->allow_unknown); out: rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; goto out; } void security_compute_av_user(u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); avd_init(policy, avd); if (!selinux_initialized()) goto allow; policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } /* permissive domain? */ if (ebitmap_get_bit(&policydb->permissive_map, scontext->type)) avd->flags |= AVD_FLAGS_PERMISSIVE; tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } if (unlikely(!tclass)) { if (policydb->allow_unknown) goto allow; goto out; } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, NULL); out: rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; goto out; } /* * Write the security context string representation of * the context structure `context' into a dynamically * allocated string of the correct size. Set `*scontext' * to point to this string and set `*scontext_len' to * the length of the string. */ static int context_struct_to_string(struct policydb *p, struct context *context, char **scontext, u32 *scontext_len) { char *scontextp; if (scontext) *scontext = NULL; *scontext_len = 0; if (context->len) { *scontext_len = context->len; if (scontext) { *scontext = kstrdup(context->str, GFP_ATOMIC); if (!(*scontext)) return -ENOMEM; } return 0; } /* Compute the size of the context. */ *scontext_len += strlen(sym_name(p, SYM_USERS, context->user - 1)) + 1; *scontext_len += strlen(sym_name(p, SYM_ROLES, context->role - 1)) + 1; *scontext_len += strlen(sym_name(p, SYM_TYPES, context->type - 1)) + 1; *scontext_len += mls_compute_context_len(p, context); if (!scontext) return 0; /* Allocate space for the context; caller must free this space. */ scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) return -ENOMEM; *scontext = scontextp; /* * Copy the user name, role name and type name into the context. 
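 * The result is the usual "user:role:type[:mls]" form, for example
 * (illustrative value only) "system_u:object_r:etc_t:s0".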
*/ scontextp += sprintf(scontextp, "%s:%s:%s", sym_name(p, SYM_USERS, context->user - 1), sym_name(p, SYM_ROLES, context->role - 1), sym_name(p, SYM_TYPES, context->type - 1)); mls_sid_to_context(p, context, &scontextp); *scontextp = 0; return 0; } static int sidtab_entry_to_string(struct policydb *p, struct sidtab *sidtab, struct sidtab_entry *entry, char **scontext, u32 *scontext_len) { int rc = sidtab_sid2str_get(sidtab, entry, scontext, scontext_len); if (rc != -ENOENT) return rc; rc = context_struct_to_string(p, &entry->context, scontext, scontext_len); if (!rc && scontext) sidtab_sid2str_put(sidtab, entry, *scontext, *scontext_len); return rc; } #include "initial_sid_to_string.h" int security_sidtab_hash_stats(char *page) { struct selinux_policy *policy; int rc; if (!selinux_initialized()) { pr_err("SELinux: %s: called before initial load_policy\n", __func__); return -EINVAL; } rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); rc = sidtab_hash_stats(policy->sidtab, page); rcu_read_unlock(); return rc; } const char *security_get_initial_sid_context(u32 sid) { if (unlikely(sid > SECINITSID_NUM)) return NULL; return initial_sid_to_string[sid]; } static int security_sid_to_context_core(u32 sid, char **scontext, u32 *scontext_len, int force, int only_invalid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *entry; int rc = 0; if (scontext) *scontext = NULL; *scontext_len = 0; if (!selinux_initialized()) { if (sid <= SECINITSID_NUM) { char *scontextp; const char *s; /* * Before the policy is loaded, translate * SECINITSID_INIT to "kernel", because systemd and * libselinux < 2.6 take a getcon_raw() result that is * both non-null and not "kernel" to mean that a policy * is already loaded. */ if (sid == SECINITSID_INIT) sid = SECINITSID_KERNEL; s = initial_sid_to_string[sid]; if (!s) return -EINVAL; *scontext_len = strlen(s) + 1; if (!scontext) return 0; scontextp = kmemdup(s, *scontext_len, GFP_ATOMIC); if (!scontextp) return -ENOMEM; *scontext = scontextp; return 0; } pr_err("SELinux: %s: called before initial " "load_policy on unknown SID %d\n", __func__, sid); return -EINVAL; } rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (force) entry = sidtab_search_entry_force(sidtab, sid); else entry = sidtab_search_entry(sidtab, sid); if (!entry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, sid); rc = -EINVAL; goto out_unlock; } if (only_invalid && !entry->context.len) goto out_unlock; rc = sidtab_entry_to_string(policydb, sidtab, entry, scontext, scontext_len); out_unlock: rcu_read_unlock(); return rc; } /** * security_sid_to_context - Obtain a context for a given SID. * @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size. Set @scontext * to point to this string and set @scontext_len to the length of the string. */ int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(sid, scontext, scontext_len, 0, 0); } int security_sid_to_context_force(u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(sid, scontext, scontext_len, 1, 0); } /** * security_sid_to_context_inval - Obtain a context for a given SID if it * is invalid. 
* @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size, but only if the * context is invalid in the current policy. Set @scontext to point to * this string (or NULL if the context is valid) and set @scontext_len to * the length of the string (or 0 if the context is valid). */ int security_sid_to_context_inval(u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(sid, scontext, scontext_len, 1, 1); } /* * Caveat: Mutates scontext. */ static int string_to_context_struct(struct policydb *pol, struct sidtab *sidtabp, char *scontext, struct context *ctx, u32 def_sid) { struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *scontextp, *p, oldc; int rc = 0; context_init(ctx); /* Parse the security context. */ rc = -EINVAL; scontextp = scontext; /* Extract the user. */ p = scontextp; while (*p && *p != ':') p++; if (*p == 0) goto out; *p++ = 0; usrdatum = symtab_search(&pol->p_users, scontextp); if (!usrdatum) goto out; ctx->user = usrdatum->value; /* Extract role. */ scontextp = p; while (*p && *p != ':') p++; if (*p == 0) goto out; *p++ = 0; role = symtab_search(&pol->p_roles, scontextp); if (!role) goto out; ctx->role = role->value; /* Extract type. */ scontextp = p; while (*p && *p != ':') p++; oldc = *p; *p++ = 0; typdatum = symtab_search(&pol->p_types, scontextp); if (!typdatum || typdatum->attribute) goto out; ctx->type = typdatum->value; rc = mls_context_to_sid(pol, oldc, p, ctx, sidtabp, def_sid); if (rc) goto out; /* Check the validity of the new context. */ rc = -EINVAL; if (!policydb_context_isvalid(pol, ctx)) goto out; rc = 0; out: if (rc) context_destroy(ctx); return rc; } static int security_context_to_sid_core(const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags, int force) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; char *scontext2, *str = NULL; struct context context; int rc = 0; /* An empty security context is never valid. */ if (!scontext_len) return -EINVAL; /* Copy the string to allow changes and ensure a NUL terminator */ scontext2 = kmemdup_nul(scontext, scontext_len, gfp_flags); if (!scontext2) return -ENOMEM; if (!selinux_initialized()) { u32 i; for (i = 1; i < SECINITSID_NUM; i++) { const char *s = initial_sid_to_string[i]; if (s && !strcmp(s, scontext2)) { *sid = i; goto out; } } *sid = SECINITSID_KERNEL; goto out; } *sid = SECSID_NULL; if (force) { /* Save another copy for storing in uninterpreted form */ rc = -ENOMEM; str = kstrdup(scontext2, gfp_flags); if (!str) goto out; } retry: rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = string_to_context_struct(policydb, sidtab, scontext2, &context, def_sid); if (rc == -EINVAL && force) { context.str = str; context.len = strlen(str) + 1; str = NULL; } else if (rc) goto out_unlock; rc = sidtab_context_to_sid(sidtab, &context, sid); if (rc == -ESTALE) { rcu_read_unlock(); if (context.str) { str = context.str; context.str = NULL; } context_destroy(&context); goto retry; } context_destroy(&context); out_unlock: rcu_read_unlock(); out: kfree(scontext2); kfree(str); return rc; } /** * security_context_to_sid - Obtain a SID for a given security context. 
* @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * @gfp: context for the allocation * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid, gfp_t gfp) { return security_context_to_sid_core(scontext, scontext_len, sid, SECSID_NULL, gfp, 0); } int security_context_str_to_sid(const char *scontext, u32 *sid, gfp_t gfp) { return security_context_to_sid(scontext, strlen(scontext), sid, gfp); } /** * security_context_to_sid_default - Obtain a SID for a given security context, * falling back to specified default if needed. * * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * @def_sid: default SID to assign on error * @gfp_flags: the allocator get-free-page (GFP) flags * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * The default SID is passed to the MLS layer to be used to allow * kernel labeling of the MLS field if the MLS field is not present * (for upgrading to MLS without full relabel). * Implicitly forces adding of the context even if it cannot be mapped yet. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid_default(const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags) { return security_context_to_sid_core(scontext, scontext_len, sid, def_sid, gfp_flags, 1); } int security_context_to_sid_force(const char *scontext, u32 scontext_len, u32 *sid) { return security_context_to_sid_core(scontext, scontext_len, sid, SECSID_NULL, GFP_KERNEL, 1); } static int compute_sid_handle_invalid_context( struct selinux_policy *policy, struct sidtab_entry *sentry, struct sidtab_entry *tentry, u16 tclass, struct context *newcontext) { struct policydb *policydb = &policy->policydb; struct sidtab *sidtab = policy->sidtab; char *s = NULL, *t = NULL, *n = NULL; u32 slen, tlen, nlen; struct audit_buffer *ab; if (sidtab_entry_to_string(policydb, sidtab, sentry, &s, &slen)) goto out; if (sidtab_entry_to_string(policydb, sidtab, tentry, &t, &tlen)) goto out; if (context_struct_to_string(policydb, newcontext, &n, &nlen)) goto out; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) goto out; audit_log_format(ab, "op=security_compute_sid invalid_context="); /* no need to record the NUL with untrusted strings */ audit_log_n_untrustedstring(ab, n, nlen - 1); audit_log_format(ab, " scontext=%s tcontext=%s tclass=%s", s, t, sym_name(policydb, SYM_CLASSES, tclass-1)); audit_log_end(ab); out: kfree(s); kfree(t); kfree(n); if (!enforcing_enabled()) return 0; return -EACCES; } static void filename_compute_type(struct policydb *policydb, struct context *newcontext, u32 stype, u32 ttype, u16 tclass, const char *objname) { struct filename_trans_key ft; struct filename_trans_datum *datum; /* * Most filename trans rules are going to live in specific directories * like /dev or /var/run. This bitmap will quickly skip rule searches * if the ttype does not contain any rules. 
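 *
 * For illustration (policy-language sketch, not from this file): a
 * name-based transition rule such as
 *
 *	type_transition init_t tmp_t : sock_file init_tmp_sock_t "mysock";
 *
 * only matches objects created in a tmp_t directory with the exact name
 * "mysock", which is what the lookup below resolves.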
*/ if (!ebitmap_get_bit(&policydb->filename_trans_ttypes, ttype)) return; ft.ttype = ttype; ft.tclass = tclass; ft.name = objname; datum = policydb_filenametr_search(policydb, &ft); while (datum) { if (ebitmap_get_bit(&datum->stypes, stype - 1)) { newcontext->type = datum->otype; return; } datum = datum->next; } } static int security_compute_sid(u32 ssid, u32 tsid, u16 orig_tclass, u16 specified, const char *objname, u32 *out_sid, bool kern) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct class_datum *cladatum; struct context *scontext, *tcontext, newcontext; struct sidtab_entry *sentry, *tentry; struct avtab_key avkey; struct avtab_node *avnode, *node; u16 tclass; int rc = 0; bool sock; if (!selinux_initialized()) { switch (orig_tclass) { case SECCLASS_PROCESS: /* kernel value */ *out_sid = ssid; break; default: *out_sid = tsid; break; } goto out; } retry: cladatum = NULL; context_init(&newcontext); rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); if (kern) { tclass = unmap_class(&policy->map, orig_tclass); sock = security_is_socket_class(orig_tclass); } else { tclass = orig_tclass; sock = security_is_socket_class(map_class(&policy->map, tclass)); } policydb = &policy->policydb; sidtab = policy->sidtab; sentry = sidtab_search_entry(sidtab, ssid); if (!sentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); rc = -EINVAL; goto out_unlock; } tentry = sidtab_search_entry(sidtab, tsid); if (!tentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); rc = -EINVAL; goto out_unlock; } scontext = &sentry->context; tcontext = &tentry->context; if (tclass && tclass <= policydb->p_classes.nprim) cladatum = policydb->class_val_to_struct[tclass - 1]; /* Set the user identity. */ switch (specified) { case AVTAB_TRANSITION: case AVTAB_CHANGE: if (cladatum && cladatum->default_user == DEFAULT_TARGET) { newcontext.user = tcontext->user; } else { /* notice this gets both DEFAULT_SOURCE and unset */ /* Use the process user identity. */ newcontext.user = scontext->user; } break; case AVTAB_MEMBER: /* Use the related object owner. */ newcontext.user = tcontext->user; break; } /* Set the role to default values. */ if (cladatum && cladatum->default_role == DEFAULT_SOURCE) { newcontext.role = scontext->role; } else if (cladatum && cladatum->default_role == DEFAULT_TARGET) { newcontext.role = tcontext->role; } else { if ((tclass == policydb->process_class) || sock) newcontext.role = scontext->role; else newcontext.role = OBJECT_R_VAL; } /* Set the type. * Look for a type transition/member/change rule. */ avkey.source_type = scontext->type; avkey.target_type = tcontext->type; avkey.target_class = tclass; avkey.specified = specified; avnode = avtab_search_node(&policydb->te_avtab, &avkey); /* If no permanent rule, also check for enabled conditional rules */ if (!avnode) { node = avtab_search_node(&policydb->te_cond_avtab, &avkey); for (; node; node = avtab_search_node_next(node, specified)) { if (node->key.specified & AVTAB_ENABLED) { avnode = node; break; } } } /* If a permanent rule is found, use the type from * the type transition/member/change rule. Otherwise, * set the type to its default values. 
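 *
 * For illustration (policy-language sketch, not from this file): a
 * classic domain transition rule
 *
 *	type_transition user_t passwd_exec_t : process passwd_t;
 *
 * shows up here as an AVTAB_TRANSITION node whose datum carries the
 * type value of passwd_t.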
*/ if (avnode) { newcontext.type = avnode->datum.u.data; } else if (cladatum && cladatum->default_type == DEFAULT_SOURCE) { newcontext.type = scontext->type; } else if (cladatum && cladatum->default_type == DEFAULT_TARGET) { newcontext.type = tcontext->type; } else { if ((tclass == policydb->process_class) || sock) { /* Use the type of process. */ newcontext.type = scontext->type; } else { /* Use the type of the related object. */ newcontext.type = tcontext->type; } } /* if we have a objname this is a file trans check so check those rules */ if (objname) filename_compute_type(policydb, &newcontext, scontext->type, tcontext->type, tclass, objname); /* Check for class-specific changes. */ if (specified & AVTAB_TRANSITION) { /* Look for a role transition rule. */ struct role_trans_datum *rtd; struct role_trans_key rtk = { .role = scontext->role, .type = tcontext->type, .tclass = tclass, }; rtd = policydb_roletr_search(policydb, &rtk); if (rtd) newcontext.role = rtd->new_role; } /* Set the MLS attributes. This is done last because it may allocate memory. */ rc = mls_compute_sid(policydb, scontext, tcontext, tclass, specified, &newcontext, sock); if (rc) goto out_unlock; /* Check the validity of the context. */ if (!policydb_context_isvalid(policydb, &newcontext)) { rc = compute_sid_handle_invalid_context(policy, sentry, tentry, tclass, &newcontext); if (rc) goto out_unlock; } /* Obtain the sid for the context. */ rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); context_destroy(&newcontext); goto retry; } out_unlock: rcu_read_unlock(); context_destroy(&newcontext); out: return rc; } /** * security_transition_sid - Compute the SID for a new subject/object. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @qstr: object name * @out_sid: security identifier for new subject/object * * Compute a SID to use for labeling a new subject or object in the * class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the new SID was * computed successfully. */ int security_transition_sid(u32 ssid, u32 tsid, u16 tclass, const struct qstr *qstr, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, qstr ? qstr->name : NULL, out_sid, true); } int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass, const char *objname, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION, objname, out_sid, false); } /** * security_member_sid - Compute the SID for member selection. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use when selecting a member of a polyinstantiated * object of class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_member_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_MEMBER, NULL, out_sid, false); } /** * security_change_sid - Compute the SID for object relabeling. 
* @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use for relabeling an object of class @tclass * based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_change_sid(u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(ssid, tsid, tclass, AVTAB_CHANGE, NULL, out_sid, false); } static inline int convert_context_handle_invalid_context( struct policydb *policydb, struct context *context) { char *s; u32 len; if (enforcing_enabled()) return -EINVAL; if (!context_struct_to_string(policydb, context, &s, &len)) { pr_warn("SELinux: Context %s would be invalid if enforcing\n", s); kfree(s); } return 0; } /** * services_convert_context - Convert a security context across policies. * @args: populated convert_context_args struct * @oldc: original context * @newc: converted context * @gfp_flags: allocation flags * * Convert the values in the security context structure @oldc from the values * specified in the policy @args->oldp to the values specified in the policy * @args->newp, storing the new context in @newc, and verifying that the * context is valid under the new policy. */ int services_convert_context(struct convert_context_args *args, struct context *oldc, struct context *newc, gfp_t gfp_flags) { struct ocontext *oc; struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *s; u32 len; int rc; if (oldc->str) { s = kstrdup(oldc->str, gfp_flags); if (!s) return -ENOMEM; rc = string_to_context_struct(args->newp, NULL, s, newc, SECSID_NULL); if (rc == -EINVAL) { /* * Retain string representation for later mapping. * * IMPORTANT: We need to copy the contents of oldc->str * back into s again because string_to_context_struct() * may have garbled it. */ memcpy(s, oldc->str, oldc->len); context_init(newc); newc->str = s; newc->len = oldc->len; return 0; } kfree(s); if (rc) { /* Other error condition, e.g. ENOMEM. */ pr_err("SELinux: Unable to map context %s, rc = %d.\n", oldc->str, -rc); return rc; } pr_info("SELinux: Context %s became valid (mapped).\n", oldc->str); return 0; } context_init(newc); /* Convert the user. */ usrdatum = symtab_search(&args->newp->p_users, sym_name(args->oldp, SYM_USERS, oldc->user - 1)); if (!usrdatum) goto bad; newc->user = usrdatum->value; /* Convert the role. */ role = symtab_search(&args->newp->p_roles, sym_name(args->oldp, SYM_ROLES, oldc->role - 1)); if (!role) goto bad; newc->role = role->value; /* Convert the type. */ typdatum = symtab_search(&args->newp->p_types, sym_name(args->oldp, SYM_TYPES, oldc->type - 1)); if (!typdatum) goto bad; newc->type = typdatum->value; /* Convert the MLS fields if dealing with MLS policies */ if (args->oldp->mls_enabled && args->newp->mls_enabled) { rc = mls_convert_context(args->oldp, args->newp, oldc, newc); if (rc) goto bad; } else if (!args->oldp->mls_enabled && args->newp->mls_enabled) { /* * Switching between non-MLS and MLS policy: * ensure that the MLS fields of the context for all * existing entries in the sidtab are filled in with a * suitable default value, likely taken from one of the * initial SIDs. 
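 * Concretely, the code below walks the new policy's OCON_ISID list
 * looking for SECINITSID_UNLABELED and copies that entry's MLS range
 * into the converted context.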
*/ oc = args->newp->ocontexts[OCON_ISID]; while (oc && oc->sid[0] != SECINITSID_UNLABELED) oc = oc->next; if (!oc) { pr_err("SELinux: unable to look up" " the initial SIDs list\n"); goto bad; } rc = mls_range_set(newc, &oc->context[0].range); if (rc) goto bad; } /* Check the validity of the new context. */ if (!policydb_context_isvalid(args->newp, newc)) { rc = convert_context_handle_invalid_context(args->oldp, oldc); if (rc) goto bad; } return 0; bad: /* Map old representation to string and save it. */ rc = context_struct_to_string(args->oldp, oldc, &s, &len); if (rc) return rc; context_destroy(newc); newc->str = s; newc->len = len; pr_info("SELinux: Context %s became invalid (unmapped).\n", newc->str); return 0; } static void security_load_policycaps(struct selinux_policy *policy) { struct policydb *p; unsigned int i; struct ebitmap_node *node; p = &policy->policydb; for (i = 0; i < ARRAY_SIZE(selinux_state.policycap); i++) WRITE_ONCE(selinux_state.policycap[i], ebitmap_get_bit(&p->policycaps, i)); for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) pr_info("SELinux: policy capability %s=%d\n", selinux_policycap_names[i], ebitmap_get_bit(&p->policycaps, i)); ebitmap_for_each_positive_bit(&p->policycaps, node, i) { if (i >= ARRAY_SIZE(selinux_policycap_names)) pr_info("SELinux: unknown policy capability %u\n", i); } } static int security_preserve_bools(struct selinux_policy *oldpolicy, struct selinux_policy *newpolicy); static void selinux_policy_free(struct selinux_policy *policy) { if (!policy) return; sidtab_destroy(policy->sidtab); kfree(policy->map.mapping); policydb_destroy(&policy->policydb); kfree(policy->sidtab); kfree(policy); } static void selinux_policy_cond_free(struct selinux_policy *policy) { cond_policydb_destroy_dup(&policy->policydb); kfree(policy); } void selinux_policy_cancel(struct selinux_load_state *load_state) { struct selinux_state *state = &selinux_state; struct selinux_policy *oldpolicy; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); sidtab_cancel_convert(oldpolicy->sidtab); selinux_policy_free(load_state->policy); kfree(load_state->convert_data); } static void selinux_notify_policy_change(u32 seqno) { /* Flush external caches and notify userspace of policy load */ avc_ss_reset(seqno); selnl_notify_policyload(seqno); selinux_status_update_policyload(seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); selinux_ima_measure_state_locked(); } void selinux_policy_commit(struct selinux_load_state *load_state) { struct selinux_state *state = &selinux_state; struct selinux_policy *oldpolicy, *newpolicy = load_state->policy; unsigned long flags; u32 seqno; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* If switching between different policy types, log MLS status */ if (oldpolicy) { if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled) pr_info("SELinux: Disabling MLS support...\n"); else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled) pr_info("SELinux: Enabling MLS support...\n"); } /* Set latest granting seqno for new policy. */ if (oldpolicy) newpolicy->latest_granting = oldpolicy->latest_granting + 1; else newpolicy->latest_granting = 1; seqno = newpolicy->latest_granting; /* Install the new policy. 
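 * Readers only ever dereference the policy under rcu_read_lock(), so
 * the rcu_assign_pointer() below publishes the new policy atomically
 * with respect to them; the sidtab freeze keeps the old SID table
 * stable while the pointer is swapped.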
*/ if (oldpolicy) { sidtab_freeze_begin(oldpolicy->sidtab, &flags); rcu_assign_pointer(state->policy, newpolicy); sidtab_freeze_end(oldpolicy->sidtab, &flags); } else { rcu_assign_pointer(state->policy, newpolicy); } /* Load the policycaps from the new policy */ security_load_policycaps(newpolicy); if (!selinux_initialized()) { /* * After first policy load, the security server is * marked as initialized and ready to handle requests and * any objects created prior to policy load are then labeled. */ selinux_mark_initialized(); selinux_complete_init(); } /* Free the old policy */ synchronize_rcu(); selinux_policy_free(oldpolicy); kfree(load_state->convert_data); /* Notify others of the policy change */ selinux_notify_policy_change(seqno); } /** * security_load_policy - Load a security policy configuration. * @data: binary policy data * @len: length of data in bytes * @load_state: policy load state * * Load a new set of security policy configuration data, * validate it and convert the SID table as necessary. * This function will flush the access vector cache after * loading the new policy. */ int security_load_policy(void *data, size_t len, struct selinux_load_state *load_state) { struct selinux_state *state = &selinux_state; struct selinux_policy *newpolicy, *oldpolicy; struct selinux_policy_convert_data *convert_data; int rc = 0; struct policy_file file = { data, len }, *fp = &file; newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL); if (!newpolicy) return -ENOMEM; newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL); if (!newpolicy->sidtab) { rc = -ENOMEM; goto err_policy; } rc = policydb_read(&newpolicy->policydb, fp); if (rc) goto err_sidtab; newpolicy->policydb.len = len; rc = selinux_set_mapping(&newpolicy->policydb, secclass_map, &newpolicy->map); if (rc) goto err_policydb; rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab); if (rc) { pr_err("SELinux: unable to load the initial SIDs\n"); goto err_mapping; } if (!selinux_initialized()) { /* First policy load, so no need to preserve state from old policy */ load_state->policy = newpolicy; load_state->convert_data = NULL; return 0; } oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* Preserve active boolean values from the old policy */ rc = security_preserve_bools(oldpolicy, newpolicy); if (rc) { pr_err("SELinux: unable to preserve booleans\n"); goto err_free_isids; } /* * Convert the internal representations of contexts * in the new SID table. 
*/ convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL); if (!convert_data) { rc = -ENOMEM; goto err_free_isids; } convert_data->args.oldp = &oldpolicy->policydb; convert_data->args.newp = &newpolicy->policydb; convert_data->sidtab_params.args = &convert_data->args; convert_data->sidtab_params.target = newpolicy->sidtab; rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params); if (rc) { pr_err("SELinux: unable to convert the internal" " representation of contexts in the new SID" " table\n"); goto err_free_convert_data; } load_state->policy = newpolicy; load_state->convert_data = convert_data; return 0; err_free_convert_data: kfree(convert_data); err_free_isids: sidtab_destroy(newpolicy->sidtab); err_mapping: kfree(newpolicy->map.mapping); err_policydb: policydb_destroy(&newpolicy->policydb); err_sidtab: kfree(newpolicy->sidtab); err_policy: kfree(newpolicy); return rc; } /** * ocontext_to_sid - Helper to safely get sid for an ocontext * @sidtab: SID table * @c: ocontext structure * @index: index of the context entry (0 or 1) * @out_sid: pointer to the resulting SID value * * For all ocontexts except OCON_ISID the SID fields are populated * on-demand when needed. Since updating the SID value is an SMP-sensitive * operation, this helper must be used to do that safely. * * WARNING: This function may return -ESTALE, indicating that the caller * must retry the operation after re-acquiring the policy pointer! */ static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c, size_t index, u32 *out_sid) { int rc; u32 sid; /* Ensure the associated sidtab entry is visible to this thread. */ sid = smp_load_acquire(&c->sid[index]); if (!sid) { rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid); if (rc) return rc; /* * Ensure the new sidtab entry is visible to other threads * when they see the SID. */ smp_store_release(&c->sid[index], sid); } *out_sid = sid; return 0; } /** * security_port_sid - Obtain the SID for a port. * @protocol: protocol number * @port: port number * @out_sid: security identifier */ int security_port_sid(u8 protocol, u16 port, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized()) { *out_sid = SECINITSID_PORT; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_PORT]; while (c) { if (c->u.port.protocol == protocol && c->u.port.low_port <= port && c->u.port.high_port >= port) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { *out_sid = SECINITSID_PORT; } out: rcu_read_unlock(); return rc; } /** * security_ib_pkey_sid - Obtain the SID for a pkey. 
* @subnet_prefix: Subnet Prefix * @pkey_num: pkey number * @out_sid: security identifier */ int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized()) { *out_sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBPKEY]; while (c) { if (c->u.ibpkey.low_pkey <= pkey_num && c->u.ibpkey.high_pkey >= pkey_num && c->u.ibpkey.subnet_prefix == subnet_prefix) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *out_sid = SECINITSID_UNLABELED; out: rcu_read_unlock(); return rc; } /** * security_ib_endport_sid - Obtain the SID for a subnet management interface. * @dev_name: device name * @port_num: port number * @out_sid: security identifier */ int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized()) { *out_sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBENDPORT]; while (c) { if (c->u.ibendport.port == port_num && !strncmp(c->u.ibendport.dev_name, dev_name, IB_DEVICE_NAME_MAX)) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *out_sid = SECINITSID_UNLABELED; out: rcu_read_unlock(); return rc; } /** * security_netif_sid - Obtain the SID for a network interface. * @name: interface name * @if_sid: interface SID */ int security_netif_sid(const char *name, u32 *if_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; bool wildcard_support; if (!selinux_initialized()) { *if_sid = SECINITSID_NETIF; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; wildcard_support = ebitmap_get_bit(&policydb->policycaps, POLICYDB_CAP_NETIF_WILDCARD); c = policydb->ocontexts[OCON_NETIF]; while (c) { if (wildcard_support) { if (match_wildcard(c->u.name, name)) break; } else { if (strcmp(c->u.name, name) == 0) break; } c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, if_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *if_sid = SECINITSID_NETIF; out: rcu_read_unlock(); return rc; } static bool match_ipv6_addrmask(const u32 input[4], const u32 addr[4], const u32 mask[4]) { int i; for (i = 0; i < 4; i++) if (addr[i] != (input[i] & mask[i])) return false; return true; } /** * security_node_sid - Obtain the SID for a node (host). 
* @domain: communication domain aka address family * @addrp: address * @addrlen: address length in bytes * @out_sid: security identifier */ int security_node_sid(u16 domain, void *addrp, u32 addrlen, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; if (!selinux_initialized()) { *out_sid = SECINITSID_NODE; return 0; } retry: rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; switch (domain) { case AF_INET: { u32 addr; rc = -EINVAL; if (addrlen != sizeof(u32)) goto out; addr = *((u32 *)addrp); c = policydb->ocontexts[OCON_NODE]; while (c) { if (c->u.node.addr == (addr & c->u.node.mask)) break; c = c->next; } break; } case AF_INET6: rc = -EINVAL; if (addrlen != sizeof(u64) * 2) goto out; c = policydb->ocontexts[OCON_NODE6]; while (c) { if (match_ipv6_addrmask(addrp, c->u.node6.addr, c->u.node6.mask)) break; c = c->next; } break; default: rc = 0; *out_sid = SECINITSID_NODE; goto out; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { *out_sid = SECINITSID_NODE; } rc = 0; out: rcu_read_unlock(); return rc; } #define SIDS_NEL 25 /** * security_get_user_sids - Obtain reachable SIDs for a user. * @fromsid: starting SID * @username: username * @sids: array of reachable SIDs for user * @nel: number of elements in @sids * * Generate the set of SIDs for legal security contexts * for a given user that can be reached by @fromsid. * Set *@sids to point to a dynamically allocated * array containing the set of SIDs. Set *@nel to the * number of elements in the array. */ int security_get_user_sids(u32 fromsid, const char *username, u32 **sids, u32 *nel) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *fromcon, usercon; u32 *mysids = NULL, *mysids2, sid; u32 i, j, mynel, maxnel = SIDS_NEL; struct user_datum *user; struct role_datum *role; struct ebitmap_node *rnode, *tnode; int rc; *sids = NULL; *nel = 0; if (!selinux_initialized()) return 0; mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL); if (!mysids) return -ENOMEM; retry: mynel = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; context_init(&usercon); rc = -EINVAL; fromcon = sidtab_search(sidtab, fromsid); if (!fromcon) goto out_unlock; rc = -EINVAL; user = symtab_search(&policydb->p_users, username); if (!user) goto out_unlock; usercon.user = user->value; ebitmap_for_each_positive_bit(&user->roles, rnode, i) { role = policydb->role_val_to_struct[i]; usercon.role = i + 1; ebitmap_for_each_positive_bit(&role->types, tnode, j) { usercon.type = j + 1; if (mls_setup_user_range(policydb, fromcon, user, &usercon)) continue; rc = sidtab_context_to_sid(sidtab, &usercon, &sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out_unlock; if (mynel < maxnel) { mysids[mynel++] = sid; } else { rc = -ENOMEM; maxnel += SIDS_NEL; mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC); if (!mysids2) goto out_unlock; memcpy(mysids2, mysids, mynel * sizeof(*mysids2)); kfree(mysids); mysids = mysids2; mysids[mynel++] = sid; } } } rc = 0; out_unlock: rcu_read_unlock(); if (rc || !mynel) { kfree(mysids); return rc; } rc = -ENOMEM; mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL); if (!mysids2) { kfree(mysids); return rc; } for (i = 0, j = 0; i < mynel; i++) { struct av_decision dummy_avd; rc = 
avc_has_perm_noaudit(fromsid, mysids[i],
					  SECCLASS_PROCESS, /* kernel value */
					  PROCESS__TRANSITION,
					  AVC_STRICT, &dummy_avd);
		if (!rc)
			mysids2[j++] = mysids[i];
		cond_resched();
	}
	kfree(mysids);
	*sids = mysids2;
	*nel = j;
	return 0;
}

/**
 * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem
 * @policy: policy
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Obtain a SID to use for a file in a filesystem that
 * cannot support xattr or use a fixed labeling behavior like
 * transition SIDs or task SIDs.
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static inline int __security_genfs_sid(struct selinux_policy *policy,
				       const char *fstype, const char *path,
				       u16 orig_sclass, u32 *sid)
{
	struct policydb *policydb = &policy->policydb;
	struct sidtab *sidtab = policy->sidtab;
	u16 sclass;
	struct genfs *genfs;
	struct ocontext *c;
	int cmp = 0;

	while (path[0] == '/' && path[1] == '/')
		path++;

	sclass = unmap_class(&policy->map, orig_sclass);
	*sid = SECINITSID_UNLABELED;

	for (genfs = policydb->genfs; genfs; genfs = genfs->next) {
		cmp = strcmp(fstype, genfs->fstype);
		if (cmp <= 0)
			break;
	}

	if (!genfs || cmp)
		return -ENOENT;

	for (c = genfs->head; c; c = c->next) {
		size_t len = strlen(c->u.name);

		if ((!c->v.sclass || sclass == c->v.sclass) &&
		    (strncmp(c->u.name, path, len) == 0))
			break;
	}

	if (!c)
		return -ENOENT;

	return ocontext_to_sid(sidtab, c, 0, sid);
}

/**
 * security_genfs_sid - Obtain a SID for a file in a filesystem
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Take the RCU read lock around __security_genfs_sid() and retry the
 * lookup whenever it returns -ESTALE, i.e. whenever the policy was
 * reloaded while the lookup was in progress.
 */
int security_genfs_sid(const char *fstype, const char *path, u16 orig_sclass,
		       u32 *sid)
{
	struct selinux_policy *policy;
	int retval;

	if (!selinux_initialized()) {
		*sid = SECINITSID_UNLABELED;
		return 0;
	}

	do {
		rcu_read_lock();
		policy = rcu_dereference(selinux_state.policy);
		retval = __security_genfs_sid(policy, fstype, path,
					      orig_sclass, sid);
		rcu_read_unlock();
	} while (retval == -ESTALE);

	return retval;
}

int selinux_policy_genfs_sid(struct selinux_policy *policy, const char *fstype,
			     const char *path, u16 orig_sclass, u32 *sid)
{
	/* no lock required, policy is not yet accessible by other threads */
	return __security_genfs_sid(policy, fstype, path, orig_sclass, sid);
}

/**
 * security_fs_use - Determine how to handle labeling for a filesystem.
* @sb: superblock in question */ int security_fs_use(struct super_block *sb) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; struct superblock_security_struct *sbsec = selinux_superblock(sb); const char *fstype = sb->s_type->name; if (!selinux_initialized()) { sbsec->behavior = SECURITY_FS_USE_NONE; sbsec->sid = SECINITSID_UNLABELED; return 0; } retry: rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_FSUSE]; while (c) { if (strcmp(fstype, c->u.name) == 0) break; c = c->next; } if (c) { sbsec->behavior = c->v.behavior; rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { rc = __security_genfs_sid(policy, fstype, "/", SECCLASS_DIR, &sbsec->sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) { sbsec->behavior = SECURITY_FS_USE_NONE; rc = 0; } else { sbsec->behavior = SECURITY_FS_USE_GENFS; } } out: rcu_read_unlock(); return rc; } int security_get_bools(struct selinux_policy *policy, u32 *len, char ***names, int **values) { struct policydb *policydb; u32 i; int rc; policydb = &policy->policydb; *names = NULL; *values = NULL; rc = 0; *len = policydb->p_bools.nprim; if (!*len) goto out; rc = -ENOMEM; *names = kcalloc(*len, sizeof(char *), GFP_ATOMIC); if (!*names) goto err; rc = -ENOMEM; *values = kcalloc(*len, sizeof(int), GFP_ATOMIC); if (!*values) goto err; for (i = 0; i < *len; i++) { (*values)[i] = policydb->bool_val_to_struct[i]->state; rc = -ENOMEM; (*names)[i] = kstrdup(sym_name(policydb, SYM_BOOLS, i), GFP_ATOMIC); if (!(*names)[i]) goto err; } rc = 0; out: return rc; err: if (*names) { for (i = 0; i < *len; i++) kfree((*names)[i]); kfree(*names); } kfree(*values); *len = 0; *names = NULL; *values = NULL; goto out; } int security_set_bools(u32 len, const int *values) { struct selinux_state *state = &selinux_state; struct selinux_policy *newpolicy, *oldpolicy; int rc; u32 i, seqno = 0; if (!selinux_initialized()) return -EINVAL; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* Consistency check on number of booleans, should never fail */ if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim)) return -EINVAL; newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL); if (!newpolicy) return -ENOMEM; /* * Deep copy only the parts of the policydb that might be * modified as a result of changing booleans. 
*/ rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb); if (rc) { kfree(newpolicy); return -ENOMEM; } /* Update the boolean states in the copy */ for (i = 0; i < len; i++) { int new_state = !!values[i]; int old_state = newpolicy->policydb.bool_val_to_struct[i]->state; if (new_state != old_state) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_MAC_CONFIG_CHANGE, "bool=%s val=%d old_val=%d auid=%u ses=%u", sym_name(&newpolicy->policydb, SYM_BOOLS, i), new_state, old_state, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); newpolicy->policydb.bool_val_to_struct[i]->state = new_state; } } /* Re-evaluate the conditional rules in the copy */ evaluate_cond_nodes(&newpolicy->policydb); /* Set latest granting seqno for new policy */ newpolicy->latest_granting = oldpolicy->latest_granting + 1; seqno = newpolicy->latest_granting; /* Install the new policy */ rcu_assign_pointer(state->policy, newpolicy); /* * Free the conditional portions of the old policydb * that were copied for the new policy, and the oldpolicy * structure itself but not what it references. */ synchronize_rcu(); selinux_policy_cond_free(oldpolicy); /* Notify others of the policy change */ selinux_notify_policy_change(seqno); return 0; } int security_get_bool_value(u32 index) { struct selinux_policy *policy; struct policydb *policydb; int rc; u32 len; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; rc = -EFAULT; len = policydb->p_bools.nprim; if (index >= len) goto out; rc = policydb->bool_val_to_struct[index]->state; out: rcu_read_unlock(); return rc; } static int security_preserve_bools(struct selinux_policy *oldpolicy, struct selinux_policy *newpolicy) { int rc, *bvalues = NULL; char **bnames = NULL; struct cond_bool_datum *booldatum; u32 i, nbools = 0; rc = security_get_bools(oldpolicy, &nbools, &bnames, &bvalues); if (rc) goto out; for (i = 0; i < nbools; i++) { booldatum = symtab_search(&newpolicy->policydb.p_bools, bnames[i]); if (booldatum) booldatum->state = bvalues[i]; } evaluate_cond_nodes(&newpolicy->policydb); out: if (bnames) { for (i = 0; i < nbools; i++) kfree(bnames[i]); } kfree(bnames); kfree(bvalues); return rc; } /* * security_sid_mls_copy() - computes a new sid based on the given * sid and the mls portion of mls_sid. */ int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *context1; struct context *context2; struct context newcon; char *s; u32 len; int rc; if (!selinux_initialized()) { *new_sid = sid; return 0; } retry: rc = 0; context_init(&newcon); rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (!policydb->mls_enabled) { *new_sid = sid; goto out_unlock; } rc = -EINVAL; context1 = sidtab_search(sidtab, sid); if (!context1) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, sid); goto out_unlock; } rc = -EINVAL; context2 = sidtab_search(sidtab, mls_sid); if (!context2) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, mls_sid); goto out_unlock; } newcon.user = context1->user; newcon.role = context1->role; newcon.type = context1->type; rc = mls_context_cpy(&newcon, context2); if (rc) goto out_unlock; /* Check the validity of the new context. 
*/ if (!policydb_context_isvalid(policydb, &newcon)) { rc = convert_context_handle_invalid_context(policydb, &newcon); if (rc) { if (!context_struct_to_string(policydb, &newcon, &s, &len)) { struct audit_buffer *ab; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=security_sid_mls_copy invalid_context="); /* don't record NUL with untrusted strings */ audit_log_n_untrustedstring(ab, s, len - 1); audit_log_end(ab); kfree(s); } goto out_unlock; } } rc = sidtab_context_to_sid(sidtab, &newcon, new_sid); if (rc == -ESTALE) { rcu_read_unlock(); context_destroy(&newcon); goto retry; } out_unlock: rcu_read_unlock(); context_destroy(&newcon); return rc; } /** * security_net_peersid_resolve - Compare and resolve two network peer SIDs * @nlbl_sid: NetLabel SID * @nlbl_type: NetLabel labeling protocol type * @xfrm_sid: XFRM SID * @peer_sid: network peer sid * * Description: * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be * resolved into a single SID it is returned via @peer_sid and the function * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function * returns a negative value. A table summarizing the behavior is below: * * | function return | @sid * ------------------------------+-----------------+----------------- * no peer labels | 0 | SECSID_NULL * single peer label | 0 | <peer_label> * multiple, consistent labels | 0 | <peer_label> * multiple, inconsistent labels | -<errno> | SECSID_NULL * */ int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid, u32 *peer_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct context *nlbl_ctx; struct context *xfrm_ctx; *peer_sid = SECSID_NULL; /* handle the common (which also happens to be the set of easy) cases * right away, these two if statements catch everything involving a * single or absent peer SID/label */ if (xfrm_sid == SECSID_NULL) { *peer_sid = nlbl_sid; return 0; } /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label * is present */ if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { *peer_sid = xfrm_sid; return 0; } if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; /* * We don't need to check initialized here since the only way both * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the * security server was initialized and state->initialized was true. */ if (!policydb->mls_enabled) { rc = 0; goto out; } rc = -EINVAL; nlbl_ctx = sidtab_search(sidtab, nlbl_sid); if (!nlbl_ctx) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, nlbl_sid); goto out; } rc = -EINVAL; xfrm_ctx = sidtab_search(sidtab, xfrm_sid); if (!xfrm_ctx) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, xfrm_sid); goto out; } rc = (mls_context_equal(nlbl_ctx, xfrm_ctx) ? 
0 : -EACCES); if (rc) goto out; /* at present NetLabel SIDs/labels really only carry MLS * information so if the MLS portion of the NetLabel SID * matches the MLS portion of the labeled XFRM SID/label * then pass along the XFRM SID as it is the most * expressive */ *peer_sid = xfrm_sid; out: rcu_read_unlock(); return rc; } static int get_classes_callback(void *k, void *d, void *args) { struct class_datum *datum = d; char *name = k, **classes = args; u32 value = datum->value - 1; classes[value] = kstrdup(name, GFP_ATOMIC); if (!classes[value]) return -ENOMEM; return 0; } int security_get_classes(struct selinux_policy *policy, char ***classes, u32 *nclasses) { struct policydb *policydb; int rc; policydb = &policy->policydb; rc = -ENOMEM; *nclasses = policydb->p_classes.nprim; *classes = kcalloc(*nclasses, sizeof(**classes), GFP_ATOMIC); if (!*classes) goto out; rc = hashtab_map(&policydb->p_classes.table, get_classes_callback, *classes); if (rc) { u32 i; for (i = 0; i < *nclasses; i++) kfree((*classes)[i]); kfree(*classes); } out: return rc; } static int get_permissions_callback(void *k, void *d, void *args) { struct perm_datum *datum = d; char *name = k, **perms = args; u32 value = datum->value - 1; perms[value] = kstrdup(name, GFP_ATOMIC); if (!perms[value]) return -ENOMEM; return 0; } int security_get_permissions(struct selinux_policy *policy, const char *class, char ***perms, u32 *nperms) { struct policydb *policydb; u32 i; int rc; struct class_datum *match; policydb = &policy->policydb; rc = -EINVAL; match = symtab_search(&policydb->p_classes, class); if (!match) { pr_err("SELinux: %s: unrecognized class %s\n", __func__, class); goto out; } rc = -ENOMEM; *nperms = match->permissions.nprim; *perms = kcalloc(*nperms, sizeof(**perms), GFP_ATOMIC); if (!*perms) goto out; if (match->comdatum) { rc = hashtab_map(&match->comdatum->permissions.table, get_permissions_callback, *perms); if (rc) goto err; } rc = hashtab_map(&match->permissions.table, get_permissions_callback, *perms); if (rc) goto err; out: return rc; err: for (i = 0; i < *nperms; i++) kfree((*perms)[i]); kfree(*perms); return rc; } int security_get_reject_unknown(void) { struct selinux_policy *policy; int value; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); value = policy->policydb.reject_unknown; rcu_read_unlock(); return value; } int security_get_allow_unknown(void) { struct selinux_policy *policy; int value; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); value = policy->policydb.allow_unknown; rcu_read_unlock(); return value; } /** * security_policycap_supported - Check for a specific policy capability * @req_cap: capability * * Description: * This function queries the currently loaded policy to see if it supports the * capability specified by @req_cap. Returns true (1) if the capability is * supported, false (0) if it isn't supported. 
* */ int security_policycap_supported(unsigned int req_cap) { struct selinux_policy *policy; int rc; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); rc = ebitmap_get_bit(&policy->policydb.policycaps, req_cap); rcu_read_unlock(); return rc; } struct selinux_audit_rule { u32 au_seqno; struct context au_ctxt; }; void selinux_audit_rule_free(void *vrule) { struct selinux_audit_rule *rule = vrule; if (rule) { context_destroy(&rule->au_ctxt); kfree(rule); } } int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; struct policydb *policydb; struct selinux_audit_rule *tmprule; struct role_datum *roledatum; struct type_datum *typedatum; struct user_datum *userdatum; struct selinux_audit_rule **rule = (struct selinux_audit_rule **)vrule; int rc = 0; *rule = NULL; if (!selinux_initialized()) return -EOPNOTSUPP; switch (field) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: /* only 'equals' and 'not equals' fit user, role, and type */ if (op != Audit_equal && op != Audit_not_equal) return -EINVAL; break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: /* we do not allow a range, indicated by the presence of '-' */ if (strchr(rulestr, '-')) return -EINVAL; break; default: /* only the above fields are valid */ return -EINVAL; } tmprule = kzalloc(sizeof(struct selinux_audit_rule), gfp); if (!tmprule) return -ENOMEM; context_init(&tmprule->au_ctxt); rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; tmprule->au_seqno = policy->latest_granting; switch (field) { case AUDIT_SUBJ_USER: case AUDIT_OBJ_USER: userdatum = symtab_search(&policydb->p_users, rulestr); if (!userdatum) { rc = -EINVAL; goto err; } tmprule->au_ctxt.user = userdatum->value; break; case AUDIT_SUBJ_ROLE: case AUDIT_OBJ_ROLE: roledatum = symtab_search(&policydb->p_roles, rulestr); if (!roledatum) { rc = -EINVAL; goto err; } tmprule->au_ctxt.role = roledatum->value; break; case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_TYPE: typedatum = symtab_search(&policydb->p_types, rulestr); if (!typedatum) { rc = -EINVAL; goto err; } tmprule->au_ctxt.type = typedatum->value; break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: rc = mls_from_string(policydb, rulestr, &tmprule->au_ctxt, GFP_ATOMIC); if (rc) goto err; break; } rcu_read_unlock(); *rule = tmprule; return 0; err: rcu_read_unlock(); selinux_audit_rule_free(tmprule); *rule = NULL; return rc; } /* Check to see if the rule contains any selinux fields */ int selinux_audit_rule_known(struct audit_krule *rule) { u32 i; for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: return 1; } } return 0; } int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; struct context *ctxt; struct mls_level *level; struct selinux_audit_rule *rule = vrule; int match = 0; if (unlikely(!rule)) { WARN_ONCE(1, "selinux_audit_rule_match: missing rule\n"); return -ENOENT; } if 
(!selinux_initialized())
		return 0;

	rcu_read_lock();

	policy = rcu_dereference(state->policy);

	if (rule->au_seqno < policy->latest_granting) {
		match = -ESTALE;
		goto out;
	}

	ctxt = sidtab_search(policy->sidtab, prop->selinux.secid);
	if (unlikely(!ctxt)) {
		WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n",
			  prop->selinux.secid);
		match = -ENOENT;
		goto out;
	}

	/* a field/op pair that is not caught here will simply fall through
	   without a match */
	switch (field) {
	case AUDIT_SUBJ_USER:
	case AUDIT_OBJ_USER:
		switch (op) {
		case Audit_equal:
			match = (ctxt->user == rule->au_ctxt.user);
			break;
		case Audit_not_equal:
			match = (ctxt->user != rule->au_ctxt.user);
			break;
		}
		break;
	case AUDIT_SUBJ_ROLE:
	case AUDIT_OBJ_ROLE:
		switch (op) {
		case Audit_equal:
			match = (ctxt->role == rule->au_ctxt.role);
			break;
		case Audit_not_equal:
			match = (ctxt->role != rule->au_ctxt.role);
			break;
		}
		break;
	case AUDIT_SUBJ_TYPE:
	case AUDIT_OBJ_TYPE:
		switch (op) {
		case Audit_equal:
			match = (ctxt->type == rule->au_ctxt.type);
			break;
		case Audit_not_equal:
			match = (ctxt->type != rule->au_ctxt.type);
			break;
		}
		break;
	case AUDIT_SUBJ_SEN:
	case AUDIT_SUBJ_CLR:
	case AUDIT_OBJ_LEV_LOW:
	case AUDIT_OBJ_LEV_HIGH:
		level = ((field == AUDIT_SUBJ_SEN || field == AUDIT_OBJ_LEV_LOW) ?
			 &ctxt->range.level[0] : &ctxt->range.level[1]);
		switch (op) {
		case Audit_equal:
			match = mls_level_eq(&rule->au_ctxt.range.level[0],
					     level);
			break;
		case Audit_not_equal:
			match = !mls_level_eq(&rule->au_ctxt.range.level[0],
					      level);
			break;
		case Audit_lt:
			match = (mls_level_dom(&rule->au_ctxt.range.level[0],
					       level) &&
				 !mls_level_eq(&rule->au_ctxt.range.level[0],
					       level));
			break;
		case Audit_le:
			match = mls_level_dom(&rule->au_ctxt.range.level[0],
					      level);
			break;
		case Audit_gt:
			match = (mls_level_dom(level,
					       &rule->au_ctxt.range.level[0]) &&
				 !mls_level_eq(level,
					       &rule->au_ctxt.range.level[0]));
			break;
		case Audit_ge:
			match = mls_level_dom(level,
					      &rule->au_ctxt.range.level[0]);
			break;
		}
	}

out:
	rcu_read_unlock();
	return match;
}

static int aurule_avc_callback(u32 event)
{
	if (event == AVC_CALLBACK_RESET)
		return audit_update_lsm_rules();
	return 0;
}

static int __init aurule_init(void)
{
	int err;

	err = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET);
	if (err)
		panic("avc_add_callback() failed, error %d\n", err);

	return err;
}
__initcall(aurule_init);

#ifdef CONFIG_NETLABEL
/**
 * security_netlbl_cache_add - Add an entry to the NetLabel cache
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Attempt to cache the SID in @sid, which was derived from @secattr, in the
 * NetLabel subsystem cache. This function assumes @secattr has already been
 * initialized.
 *
 */
static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
				      u32 sid)
{
	u32 *sid_cache;

	sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC);
	if (sid_cache == NULL)
		return;
	secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
	if (secattr->cache == NULL) {
		kfree(sid_cache);
		return;
	}

	*sid_cache = sid;
	secattr->cache->free = kfree;
	secattr->cache->data = sid_cache;
	secattr->flags |= NETLBL_SECATTR_CACHE;
}

/**
 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Convert the given NetLabel security attributes in @secattr into a
 * SELinux SID. If the @secattr field does not contain a full SELinux
 * SID/context then use SECINITSID_NETMSG as the foundation.
If possible the * 'cache' field of @secattr is set and the CACHE flag is set; this is to * allow the @secattr to be used by NetLabel to cache the secattr to SID * conversion for future lookups. Returns zero on success, negative values on * failure. * */ int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr, u32 *sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct context *ctx; struct context ctx_new; if (!selinux_initialized()) { *sid = SECSID_NULL; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (secattr->flags & NETLBL_SECATTR_CACHE) *sid = *(u32 *)secattr->cache->data; else if (secattr->flags & NETLBL_SECATTR_SECID) *sid = secattr->attr.secid; else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { rc = -EIDRM; ctx = sidtab_search(sidtab, SECINITSID_NETMSG); if (ctx == NULL) goto out; context_init(&ctx_new); ctx_new.user = ctx->user; ctx_new.role = ctx->role; ctx_new.type = ctx->type; mls_import_netlbl_lvl(policydb, &ctx_new, secattr); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { rc = mls_import_netlbl_cat(policydb, &ctx_new, secattr); if (rc) goto out; } rc = -EIDRM; if (!mls_context_isvalid(policydb, &ctx_new)) { ebitmap_destroy(&ctx_new.range.level[0].cat); goto out; } rc = sidtab_context_to_sid(sidtab, &ctx_new, sid); ebitmap_destroy(&ctx_new.range.level[0].cat); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; security_netlbl_cache_add(secattr, *sid); } else *sid = SECSID_NULL; out: rcu_read_unlock(); return rc; } /** * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr * @sid: the SELinux SID * @secattr: the NetLabel packet security attributes * * Description: * Convert the given SELinux SID in @sid into a NetLabel security attribute. * Returns zero on success, negative values on failure. * */ int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr) { struct selinux_policy *policy; struct policydb *policydb; int rc; struct context *ctx; if (!selinux_initialized()) return 0; rcu_read_lock(); policy = rcu_dereference(selinux_state.policy); policydb = &policy->policydb; rc = -ENOENT; ctx = sidtab_search(policy->sidtab, sid); if (ctx == NULL) goto out; rc = -ENOMEM; secattr->domain = kstrdup(sym_name(policydb, SYM_TYPES, ctx->type - 1), GFP_ATOMIC); if (secattr->domain == NULL) goto out; secattr->attr.secid = sid; secattr->flags |= NETLBL_SECATTR_DOMAIN_CPY | NETLBL_SECATTR_SECID; mls_export_netlbl_lvl(policydb, ctx, secattr); rc = mls_export_netlbl_cat(policydb, ctx, secattr); out: rcu_read_unlock(); return rc; } #endif /* CONFIG_NETLABEL */ /** * __security_read_policy - read the policy. * @policy: SELinux policy * @data: binary policy data * @len: length of data in bytes * */ static int __security_read_policy(struct selinux_policy *policy, void *data, size_t *len) { int rc; struct policy_file fp; fp.data = data; fp.len = *len; rc = policydb_write(&policy->policydb, &fp); if (rc) return rc; *len = (unsigned long)fp.data - (unsigned long)data; return 0; } /** * security_read_policy - read the policy. 
* @data: binary policy data * @len: length of data in bytes * */ int security_read_policy(void **data, size_t *len) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; policy = rcu_dereference_protected( state->policy, lockdep_is_held(&state->policy_mutex)); if (!policy) return -EINVAL; *len = policy->policydb.len; *data = vmalloc_user(*len); if (!*data) return -ENOMEM; return __security_read_policy(policy, *data, len); } /** * security_read_state_kernel - read the policy. * @data: binary policy data * @len: length of data in bytes * * Allocates kernel memory for reading SELinux policy. * This function is for internal use only and should not * be used for returning data to user space. * * This function must be called with policy_mutex held. */ int security_read_state_kernel(void **data, size_t *len) { int err; struct selinux_state *state = &selinux_state; struct selinux_policy *policy; policy = rcu_dereference_protected( state->policy, lockdep_is_held(&state->policy_mutex)); if (!policy) return -EINVAL; *len = policy->policydb.len; *data = vmalloc(*len); if (!*data) return -ENOMEM; err = __security_read_policy(policy, *data, len); if (err) { vfree(*data); *data = NULL; *len = 0; } return err; }
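/*
 * Editor's sketch, not part of services.c: the lookup pattern shared by the
 * security_*_sid() helpers above. Take the RCU read lock, snapshot the policy
 * pointer, walk one of the policydb ocontext lists, and restart from the top
 * whenever ocontext_to_sid() returns -ESTALE, which signals that the policy
 * was reloaded while the sidtab entry was being created. The ocon_index
 * argument and the match() callback are hypothetical stand-ins for the
 * per-object matching each real helper performs inline.
 */
static int example_ocontext_lookup(u32 ocon_index,
				   bool (*match)(const struct ocontext *c),
				   u32 fallback_sid, u32 *out_sid)
{
	struct selinux_policy *policy;
	struct ocontext *c;
	int rc = 0;

	if (!selinux_initialized()) {
		*out_sid = fallback_sid;	/* no policy loaded yet */
		return 0;
	}

retry:
	rcu_read_lock();
	policy = rcu_dereference(selinux_state.policy);

	for (c = policy->policydb.ocontexts[ocon_index]; c; c = c->next)
		if (match(c))
			break;

	if (c) {
		rc = ocontext_to_sid(policy->sidtab, c, 0, out_sid);
		if (rc == -ESTALE) {
			/* policy changed under us; retry against the new one */
			rcu_read_unlock();
			goto retry;
		}
	} else {
		*out_sid = fallback_sid;
	}

	rcu_read_unlock();
	return rc;
}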
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Lock-less NULL terminated single linked list
 *
 * The basic atomic operation of this list is cmpxchg on long. On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers. So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/llist.h>

/**
 * llist_add_batch - add several linked entries in batch
 * @new_first:	first entry in batch to be added
 * @new_last:	last entry in batch to be added
 * @head:	the head for your lock-less list
 *
 * Return whether list is empty before adding.
 */
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
		     struct llist_head *head)
{
	struct llist_node *first = READ_ONCE(head->first);

	do {
		new_last->next = first;
	} while (!try_cmpxchg(&head->first, &first, new_first));

	return !first;
}
EXPORT_SYMBOL_GPL(llist_add_batch);

/**
 * llist_del_first - delete the first entry of lock-less list
 * @head:	the head for your lock-less list
 *
 * If the list is empty, return NULL; otherwise, return the first entry
 * deleted. This is the newest added one.
 *
 * Only one llist_del_first user can run simultaneously with multiple
 * llist_add users without a lock, because otherwise an
 * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
 * llist_add) sequence in another user may change @head->first->next,
 * but keep @head->first. If multiple consumers are needed, please
 * use llist_del_all or use a lock between consumers.
 */
struct llist_node *llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = smp_load_acquire(&head->first);
	do {
		if (entry == NULL)
			return NULL;
		next = READ_ONCE(entry->next);
	} while (!try_cmpxchg(&head->first, &entry, next));

	return entry;
}
EXPORT_SYMBOL_GPL(llist_del_first);

/**
 * llist_del_first_this - delete given entry of lock-less list if it is first
 * @head:	the head for your lock-less list
 * @this:	a list entry.
 *
 * If head of the list is given entry, delete and return %true else
 * return %false.
 *
 * Multiple callers can safely call this concurrently with multiple
 * llist_add() callers, provided all the callers offer a different @this.
 */
bool llist_del_first_this(struct llist_head *head,
			  struct llist_node *this)
{
	struct llist_node *entry, *next;

	/* acquire ensures ordering wrt the try_cmpxchg() in llist_del_first() */
	entry = smp_load_acquire(&head->first);
	do {
		if (entry != this)
			return false;
		next = READ_ONCE(entry->next);
	} while (!try_cmpxchg(&head->first, &entry, next));

	return true;
}
EXPORT_SYMBOL_GPL(llist_del_first_this);

/**
 * llist_reverse_order - reverse order of a llist chain
 * @head:	first item of the list to be reversed
 *
 * Reverse the order of a chain of llist entries and return the
 * new first entry.
 */
struct llist_node *llist_reverse_order(struct llist_node *head)
{
	struct llist_node *new_head = NULL;

	while (head) {
		struct llist_node *tmp = head;

		head = head->next;
		tmp->next = new_head;
		new_head = tmp;
	}

	return new_head;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);
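/*
 * Editor's sketch, not part of llist.c: the canonical producer/consumer
 * pattern for this API. Many producers push items with llist_add() while a
 * single consumer detaches the whole chain with llist_del_all() and uses
 * llist_reverse_order() to process items in submission (FIFO) order.
 * llist_add(), llist_del_all(), LLIST_HEAD() and llist_for_each_entry_safe()
 * come from <linux/llist.h>; struct work_item and process_item() are
 * hypothetical names used only for illustration.
 */
#include <linux/llist.h>

struct work_item {
	struct llist_node node;
	int payload;
};

static LLIST_HEAD(example_pending);

static void process_item(struct work_item *item)
{
	/* hypothetical per-item work; placeholder for illustration */
	(void)item->payload;
}

static void example_produce(struct work_item *item)
{
	/* safe from any context, including NMI on NMI-safe-cmpxchg arches */
	llist_add(&item->node, &example_pending);
}

static void example_consume(void)
{
	struct llist_node *chain = llist_del_all(&example_pending);
	struct work_item *item, *tmp;

	/* llist_del_all() hands back newest-first; flip to oldest-first */
	chain = llist_reverse_order(chain);
	llist_for_each_entry_safe(item, tmp, chain, node)
		process_item(item);
}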
// SPDX-License-Identifier: GPL-2.0
/*
 * Kernel internal schedule timeout and sleeping functions
 */

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>

#include "tick-internal.h"

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
	struct timer_list timer;
	struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
	struct process_timer *timeout = from_timer(timeout, t, timer);

	wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns: 0 when the timer has expired, otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
	struct process_timer timer;
	unsigned long expire;

	switch (timeout) {
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are useful to be comfortable
		 * in the caller. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative values,
		 * but I'd like to return a valid offset (>=0) to allow
		 * the caller to do everything it wants with the retval.
		 */
		schedule();
		goto out;
	default:
		/*
		 * Another bit of PARANOID. Note that the retval will be
		 * 0 since no piece of kernel is supposed to do a check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happen anyway). You just have the printk()
		 * that will tell you if something has gone wrong and where.
		 */
		if (timeout < 0) {
			pr_err("%s: wrong timeout value %lx\n", __func__,
			       timeout);
			dump_stack();
			__set_current_state(TASK_RUNNING);
			goto out;
		}
	}

	expire = timeout + jiffies;

	timer.task = current;
	timer_setup_on_stack(&timer.timer, process_timeout, 0);
	timer.timer.expires = expire;
	add_timer(&timer.timer);
	schedule();
	timer_delete_sync(&timer.timer);

	/* Remove the timer from the object tracker */
	destroy_timer_on_stack(&timer.timer);

	timeout = expire - jiffies;

out:
	return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * __set_current_state() can be used in schedule_timeout_*() functions, because
 * schedule_timeout() calls schedule() unconditionally.
 */

/**
 * schedule_timeout_interruptible - sleep until timeout (interruptible)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_INTERRUPTIBLE before starting the timeout.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
	__set_current_state(TASK_INTERRUPTIBLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

/**
 * schedule_timeout_killable - sleep until timeout (killable)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_KILLABLE before starting the timeout.
 */
signed long __sched schedule_timeout_killable(signed long timeout)
{
	__set_current_state(TASK_KILLABLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_killable);

/**
 * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_UNINTERRUPTIBLE before starting the timeout.
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
	__set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/**
 * schedule_timeout_idle - sleep until timeout (idle)
 * @timeout: timeout value in jiffies
 *
 * See schedule_timeout() for details.
 *
 * Task state is set to TASK_IDLE before starting the timeout. It is similar to
 * schedule_timeout_uninterruptible(), except this task will not contribute to
 * load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
	__set_current_state(TASK_IDLE);
	return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_idle);

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:	timeout value (ktime_t)
 * @delta:	slack in expires timeout (ktime_t)
 * @mode:	timer mode
 * @clock_id:	timer clock to be used
 *
 * Details are explained in schedule_hrtimeout_range() function description as
 * this function is commonly used.
 */
int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
					   const enum hrtimer_mode mode,
					   clockid_t clock_id)
{
	struct hrtimer_sleeper t;

	/*
	 * Optimize when a zero timeout value is given. It does not
	 * matter whether this is an absolute or a relative time.
	 */
	if (expires && *expires == 0) {
		__set_current_state(TASK_RUNNING);
		return 0;
	}

	/*
	 * A NULL parameter means "infinite"
	 */
	if (!expires) {
		schedule();
		return -EINTR;
	}

	hrtimer_setup_sleeper_on_stack(&t, clock_id, mode);
	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
	hrtimer_sleeper_start_expires(&t, mode);

	if (likely(t.task))
		schedule();

	hrtimer_cancel(&t.timer);
	destroy_hrtimer_on_stack(&t.timer);

	__set_current_state(TASK_RUNNING);

	return !t.task ? 0 : -EINTR;
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:	timeout value (ktime_t)
 * @delta:	slack in expires timeout (ktime_t)
 * @mode:	timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * The @delta argument gives the kernel the freedom to schedule the
 * actual wakeup to a time that is both power and performance friendly
 * for regular (non RT/DL) tasks.
 * The kernel gives the normal best-effort behavior for "@expires+@delta",
 * but may decide to fire the timer earlier, though no earlier than @expires.
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Returns: 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
				     const enum hrtimer_mode mode)
{
	return schedule_hrtimeout_range_clock(expires, delta, mode,
					      CLOCK_MONOTONIC);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:	timeout value (ktime_t)
 * @mode:	timer mode
 *
 * See schedule_hrtimeout_range() for details. @delta argument of
 * schedule_hrtimeout_range() is set to 0 and has therefore no impact.
 */
int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode)
{
	return schedule_hrtimeout_range(expires, 0, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs:	Requested sleep duration in milliseconds
 *
 * msleep() uses jiffy based timeouts for the sleep duration. Because of the
 * design of the timer wheel, the maximum additional percentage delay (slack)
 * is 12.5%. This is only valid for timers which will end up in level 1 or a
 * higher level of the timer wheel. For explanation of those 12.5% please
 * check the detailed description about the basics of the timer wheel.
 *
 * The slack of timers which will end up in level 0 depends on sleep duration
 * (msecs) and HZ configuration and can be calculated in the following way
 * (with the timer wheel design restriction that the slack is not less than
 * 12.5%):
 *
 * ``slack = MSECS_PER_TICK / msecs``
 *
 * When the allowed slack of the callsite is known, the calculation could be
 * turned around to find the minimal allowed sleep duration to meet the
 * constraints. For example:
 *
 * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``:
 *   all sleep durations greater than or equal to 4ms will meet the constraints.
 * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``:
 *   all sleep durations greater than or equal to 8ms will meet the constraints.
 * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``:
 *   all sleep durations greater than or equal to 16ms will meet the constraints.
 * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``:
 *   all sleep durations greater than or equal to 32ms will meet the constraints.
 *
 * See also the signal aware variant msleep_interruptible().
 */
void msleep(unsigned int msecs)
{
	unsigned long timeout = msecs_to_jiffies(msecs);

	while (timeout)
		timeout = schedule_timeout_uninterruptible(timeout);
}
EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs:	Requested sleep duration in milliseconds
 *
 * See msleep() for some basic information.
 *
 * The difference between msleep() and msleep_interruptible() is that the
 * sleep can be interrupted by a signal delivery, in which case it returns
 * early.
 *
 * Returns: The remaining time of the sleep duration transformed to msecs (see
 * schedule_timeout() for details).
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
	unsigned long timeout = msecs_to_jiffies(msecs);

	while (timeout && !signal_pending(current))
		timeout = schedule_timeout_interruptible(timeout);

	return jiffies_to_msecs(timeout);
}
EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range_state - Sleep for an approximate time in a given state
 * @min:	Minimum time in usecs to sleep
 * @max:	Maximum time in usecs to sleep
 * @state:	State the current task will be in while sleeping
 *
 * usleep_range_state() sleeps at least for the minimum specified time but not
 * longer than the maximum specified amount of time. The range might reduce
 * power usage by allowing hrtimers to coalesce an already scheduled interrupt
 * with this hrtimer. In the worst case, an interrupt is scheduled for the
 * upper bound.
 *
 * The sleeping task is set to the specified state before starting the sleep.
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range() or its variants instead of udelay(). The sleep improves
 * responsiveness by avoiding the CPU-hogging busy-wait of udelay().
 */
void __sched usleep_range_state(unsigned long min, unsigned long max,
				unsigned int state)
{
	ktime_t exp = ktime_add_us(ktime_get(), min);
	u64 delta = (u64)(max - min) * NSEC_PER_USEC;

	if (WARN_ON_ONCE(max < min))
		delta = 0;

	for (;;) {
		__set_current_state(state);
		/* Do not return before the requested sleep time has elapsed */
		if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
			break;
	}
}
EXPORT_SYMBOL(usleep_range_state);
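/*
 * Editor's sketch, not part of this file: choosing a sleep primitive by
 * duration, following the trade-offs documented above. Busy-wait with
 * udelay() only for very short delays, use usleep_range() where hrtimer
 * coalescing helps, and fall back to the cheap jiffy-based msleep() once
 * its >=12.5% slack is acceptable. The thresholds below are illustrative
 * assumptions, not values mandated by this file.
 */
#include <linux/delay.h>
#include <linux/time64.h>

static void example_pause_usecs(unsigned int usecs)
{
	if (usecs < 10) {
		/* too short for a hrtimer wakeup to pay off; busy-wait */
		udelay(usecs);
	} else if (usecs < 20 * USEC_PER_MSEC) {
		/* allow ~25% slack so hrtimers can coalesce wakeups */
		usleep_range(usecs, usecs + usecs / 4);
	} else {
		/* long sleep: timer-wheel slack no longer matters much */
		msleep(usecs / USEC_PER_MSEC);
	}
}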
#ifndef LLC_H
#define LLC_H
/*
 * Copyright (c) 1997 by Procom Technology, Inc.
 *		 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */

#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rculist_nulls.h>
#include <linux/hash.h>
#include <linux/jhash.h>

#include <linux/atomic.h>

struct net_device;
struct packet_type;
struct sk_buff;

struct llc_addr {
	unsigned char lsap;
	unsigned char mac[IFHWADDRLEN];
};

#define LLC_SAP_STATE_INACTIVE	1
#define LLC_SAP_STATE_ACTIVE	2

#define LLC_SK_DEV_HASH_BITS 6
#define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS)

#define LLC_SK_LADDR_HASH_BITS 6
#define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS)

/**
 * struct llc_sap - Defines the SAP component
 *
 * @state - sap state
 * @p_bit - only lowest-order bit used
 * @f_bit - only lowest-order bit used
 * @laddr - SAP value in this 'lsap'
 * @node - entry in station sap_list
 * @sk_list - LLC sockets this one manages
 */
struct llc_sap {
	unsigned char	 state;
	unsigned char	 p_bit;
	unsigned char	 f_bit;
	refcount_t	 refcnt;
	int		 (*rcv_func)(struct sk_buff *skb,
				     struct net_device *dev,
				     struct packet_type *pt,
				     struct net_device *orig_dev);
	struct llc_addr	 laddr;
	struct list_head node;
	spinlock_t sk_lock;
	int sk_count;
	struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES];
	struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES];
	struct rcu_head rcu;
};

static inline
struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex)
{
	u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS);

	return &sap->sk_dev_hash[bucket];
}

static inline
u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr)
{
	return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0),
		       LLC_SK_LADDR_HASH_BITS);
}

static inline
struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap,
					   const struct llc_addr *laddr)
{
	return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)];
}

#define LLC_DEST_INVALID         0      /* Invalid LLC PDU type */
#define LLC_DEST_SAP             1      /* Type 1 goes here */
#define LLC_DEST_CONN            2      /* Type 2 goes here */

extern struct list_head llc_sap_list;

int llc_rcv(struct sk_buff *skb, struct net_device *dev,
	    struct packet_type *pt, struct net_device *orig_dev);

int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa,
		     const unsigned char *da);

void llc_add_pack(int type,
		  void (*handler)(struct llc_sap *sap, struct sk_buff *skb));
void llc_remove_pack(int type);

void llc_set_station_handler(void (*handler)(struct sk_buff *skb));

struct llc_sap *llc_sap_open(unsigned char lsap,
			     int (*rcv)(struct sk_buff *skb,
					struct net_device *dev,
					struct packet_type *pt,
					struct net_device *orig_dev));
static inline void llc_sap_hold(struct llc_sap *sap)
{
	refcount_inc(&sap->refcnt);
}

static inline bool llc_sap_hold_safe(struct llc_sap *sap)
{
	return refcount_inc_not_zero(&sap->refcnt);
}

void llc_sap_close(struct llc_sap *sap);

static inline void llc_sap_put(struct llc_sap *sap)
{
	if (refcount_dec_and_test(&sap->refcnt))
		llc_sap_close(sap);
}

struct llc_sap *llc_sap_find(unsigned char sap_value);

int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
			      const unsigned char *dmac, unsigned char dsap);

void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb);
void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb);

void llc_station_init(void);
void llc_station_exit(void);

#ifdef CONFIG_PROC_FS
int llc_proc_init(void);
void llc_proc_exit(void);
#else
#define llc_proc_init()	(0)
#define llc_proc_exit()	do { } while(0)
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL
int llc_sysctl_init(void);
void llc_sysctl_exit(void);

extern int sysctl_llc2_ack_timeout;
extern int sysctl_llc2_busy_timeout;
extern int sysctl_llc2_p_timeout;
extern int sysctl_llc2_rej_timeout;
#else
#define llc_sysctl_init() (0)
#define llc_sysctl_exit() do { } while(0)
#endif /* CONFIG_SYSCTL */
#endif /* LLC_H */
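A short usage sketch of the reference-counting helpers above (the function, the static variable, the 0xAA SAP number, and the NULL receive handler are illustrative assumptions, not part of the header): llc_sap_hold() takes an extra reference, and llc_sap_put() drops one, closing the SAP once the last reference is gone.

static struct llc_sap *example_sap;

/* Illustration only: open a SAP, take/drop an extra reference, close it. */
static int example_open_close(void)
{
	example_sap = llc_sap_open(0xAA, NULL);	/* assumed free; NULL rcv handler */
	if (!example_sap)
		return -EBUSY;			/* SAP already in use */
	llc_sap_hold(example_sap);		/* reference for a second user */
	llc_sap_put(example_sap);		/* second user done */
	llc_sap_put(example_sap);		/* last put closes the SAP */
	return 0;
}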
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

#include <linux/instruction_pointer.h>
#include <linux/preempt.h>

#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
	preempt_count_add(cnt);
	barrier();
}
#endif

static inline void local_bh_disable(void)
{
	__local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

extern void _local_bh_enable(void);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

static inline void local_bh_enable_ip(unsigned long ip)
{
	__local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET);
}

static inline void local_bh_enable(void)
{
	__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}

#ifdef CONFIG_PREEMPT_RT
extern bool local_bh_blocked(void);
#else
static inline bool local_bh_blocked(void) { return false; }
#endif

#endif /* _LINUX_BH_H */
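A brief usage sketch (the per-CPU variable and function are hypothetical, and it assumes <linux/percpu.h> is available): local_bh_disable()/local_bh_enable() bracket a critical section so that a softirq handler on the same CPU cannot run in the middle of it.

static DEFINE_PER_CPU(int, example_counter);

/* Illustration only: update per-CPU data that a softirq handler also touches. */
static void example_update(void)
{
	local_bh_disable();		/* keep softirqs from preempting us here */
	__this_cpu_inc(example_counter);
	local_bh_enable();		/* may run pending softirqs on the way out */
}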
// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of the diskquota system for the LINUX operating system. QUOTA * is implemented using the BSD system call interface as the means of * communication with the user level. This file contains the generic routines * called by the different filesystems on allocation of an inode or block. * These routines take care of the administration needed to have a consistent * diskquota tracking system. The ideas of both user and group quotas are based * on the Melbourne quota system as used on BSD derived systems. The internal * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 * * Revised list management to avoid races * -- Bill Hawes, <whawes@star.net>, 9/98 * * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). * As the consequence the locking was moved from dquot_decr_...(), * dquot_incr_...() to calling functions. * invalidate_dquots() now writes modified dquots. * Serialized quota_off() and quota_on() for mount point. * Fixed a few bugs in grow_dquots(). * Fixed deadlock in write_dquot() - we no longer account quotas on * quota files * remove_dquot_ref() moved to inode.c - it now traverses through inodes * add_dquot_ref() restarts after blocking * Added check for bogus uid and fixed check for group in quotactl. * Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99 * * Used struct list_head instead of own list struct * Invalidation of referenced dquots is no longer possible * Improved free_dquots list management * Quota and i_blocks are now updated in one place to avoid races * Warnings are now delayed so we won't block in critical section * Write updated not to require dquot lock * Jan Kara, <jack@suse.cz>, 9/2000 * * Added dynamic quota structure allocation * Jan Kara <jack@suse.cz> 12/2000 * * Rewritten quota interface. Implemented new quota format and * formats registering. * Jan Kara, <jack@suse.cz>, 2001,2002 * * New SMP locking. * Jan Kara, <jack@suse.cz>, 10/2002 * * Added journalled quota support, fix lock inversion problems * Jan Kara, <jack@suse.cz>, 2003,2004 * * (C) Copyright 1994 - 1997 Marco van Wieringen */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/mm.h> #include <linux/time.h> #include <linux/types.h> #include <linux/string.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/tty.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/kmod.h> #include <linux/namei.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> /* * There are five quota SMP locks: * * dq_list_lock protects all lists with quotas and quota formats. * * dquot->dq_dqb_lock protects data from dq_dqb * * inode->i_lock protects inode->i_blocks, i_bytes and also guards * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that * dquot_transfer() can stabilize amount it transfers * * dq_data_lock protects mem_dqinfo structures and modifications of dquot * pointers in the inode * * dq_state_lock protects modifications of quota state (on quotaon and * quotaoff) and readers who care about latest values take it as well. 
 *
 * The spinlock ordering is hence:
 *   dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock,
 *   dq_list_lock > dq_state_lock
 *
 * Note that some things (e.g. the sb pointer, type, and id) don't change
 * during the life of the dquot structure and so need not be protected by a
 * lock.
 *
 * Operations accessing dquots via inode pointers are protected by dquot_srcu.
 * Reading such a pointer requires srcu_read_lock(&dquot_srcu), and
 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
 * inode and before dropping dquot references to avoid use of dquots after
 * they are freed. dq_data_lock is used to serialize the pointer setting and
 * clearing operations.
 * Special care needs to be taken about the S_NOQUOTA inode flag (marking that
 * an inode is a quota file). Functions adding pointers from inode to dquots
 * have to check this flag under dq_data_lock and then (if S_NOQUOTA is not
 * set) they have to do all pointer modifications before dropping
 * dq_data_lock. This makes sure they cannot race with quotaon, which first
 * sets the S_NOQUOTA flag and then drops all pointers to dquots from an
 * inode.
 *
 * Each dquot has its dq_lock mutex. The dquot is locked when it is being read
 * into memory (or space for it is being allocated) on the first dqget(), when
 * it is being written out, and when it is being released on the last dqput().
 * The allocation and release operations are serialized by the dq_lock and by
 * checking the use count in dquot_release().
 *
 * Lock ordering (including related VFS locks) is the following:
 *   s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem
 */

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
DEFINE_STATIC_SRCU(dquot_srcu);

static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq);
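To make the first ordering rule concrete, here is a minimal sketch (the function is hypothetical and added for illustration only) of a nesting that respects dq_data_lock > dq_list_lock:

/* Illustration only: dq_data_lock is taken before, and released after,
 * dq_list_lock, matching the documented spinlock ordering. */
static void example_lock_nesting(void)
{
	spin_lock(&dq_data_lock);	/* outer lock */
	spin_lock(&dq_list_lock);	/* inner lock, nests under dq_data_lock */
	/* ... touch quota lists and mem_dqinfo here ... */
	spin_unlock(&dq_list_lock);
	spin_unlock(&dq_data_lock);
}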
void __quota_error(struct super_block *sb, const char *func,
		   const char *fmt, ...)
{
	if (printk_ratelimit()) {
		va_list args;
		struct va_format vaf;

		va_start(args, fmt);

		vaf.fmt = fmt;
		vaf.va = &args;

		printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
		       sb->s_id, func, &vaf);

		va_end(args);
	}
}
EXPORT_SYMBOL(__quota_error);

#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
static char *quotatypes[] = INITQFNAMES;
#endif
static struct quota_format_type *quota_formats;	/* List of registered formats */
static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;

/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;

void register_quota_format(struct quota_format_type *fmt)
{
	spin_lock(&dq_list_lock);
	fmt->qf_next = quota_formats;
	quota_formats = fmt;
	spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(register_quota_format);

void unregister_quota_format(struct quota_format_type *fmt)
{
	struct quota_format_type **actqf;

	spin_lock(&dq_list_lock);
	for (actqf = &quota_formats; *actqf && *actqf != fmt;
	     actqf = &(*actqf)->qf_next)
		;
	if (*actqf)
		*actqf = (*actqf)->qf_next;
	spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(unregister_quota_format);

static struct quota_format_type *find_quota_format(int id)
{
	struct quota_format_type *actqf;

	spin_lock(&dq_list_lock);
	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
	     actqf = actqf->qf_next)
		;
	if (!actqf || !try_module_get(actqf->qf_owner)) {
		int qm;

		spin_unlock(&dq_list_lock);

		for (qm = 0; module_names[qm].qm_fmt_id &&
			     module_names[qm].qm_fmt_id != id; qm++)
			;
		if (!module_names[qm].qm_fmt_id ||
		    request_module(module_names[qm].qm_mod_name))
			return NULL;

		spin_lock(&dq_list_lock);
		for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
		     actqf = actqf->qf_next)
			;
		if (actqf && !try_module_get(actqf->qf_owner))
			actqf = NULL;
	}
	spin_unlock(&dq_list_lock);
	return actqf;
}

static void put_quota_format(struct quota_format_type *fmt)
{
	module_put(fmt->qf_owner);
}

/*
 * Dquot List Management:
 * The quota code uses five lists for dquot management: the inuse_list,
 * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
 * A single dquot structure may be on some of those lists, depending on
 * its current state.
 *
 * All dquots are placed at the end of inuse_list when first created, and this
 * list is used for the invalidate operation, which must look at every dquot.
 *
 * When the last reference of a dquot is dropped, the dquot is added to
 * releasing_dquots. We then queue a work item which will call
 * synchronize_srcu() and after that perform the final cleanup of all the
 * dquots on the list. Each cleaned up dquot is moved to the free_dquots list.
 * Both releasing_dquots and free_dquots use the dq_free list_head in the
 * dquot struct.
 *
 * Unused and cleaned up dquots are in the free_dquots list and this list is
 * searched whenever we need an available dquot. Dquots are removed from the
 * list as soon as they are used again and dqstats.free_dquots gives the
 * number of dquots on the list. When a dquot is invalidated it is completely
 * released from memory.
 *
 * Dirty dquots are added to the dqi_dirty_list of quota_info when marked
 * dirty, and this list is searched when writing dirty dquots back to the
 * quota file. Note that some filesystems do dirty dquot tracking on their
 * own (e.g. in a journal) and thus don't use dqi_dirty_list.
 *
 * Dquots with a specific identity (device, type and id) are placed on
 * one of the dquot_hash[] hash chains. This provides an efficient search
 * mechanism to locate a specific dquot.
 */
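The reference life cycle described above is driven through dqget()/dqput(), declared in <linux/quotaops.h>. A minimal usage sketch (the function is hypothetical, illustration only):

/* Illustration only: take and drop a dquot reference. The final dqput()
 * parks the dquot on releasing_dquots for deferred cleanup. */
static void example_dquot_use(struct super_block *sb, struct kqid qid)
{
	struct dquot *dquot = dqget(sb, qid);	/* may block reading the dquot */

	if (IS_ERR(dquot))
		return;
	/* ... inspect usage/limits in dquot->dq_dqb under dq_dqb_lock ... */
	dqput(dquot);
}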
static LIST_HEAD(inuse_list);
static LIST_HEAD(free_dquots);
static LIST_HEAD(releasing_dquots);
static unsigned int dq_hash_bits, dq_hash_mask;
static struct hlist_head *dquot_hash;

struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);

static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);

static void quota_release_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);

static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
	unsigned int id = from_kqid(&init_user_ns, qid);
	int type = qid.type;
	unsigned long tmp;

	tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type);
	return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask;
}

/*
 * Following list functions expect dq_list_lock to be held
 */
static inline void insert_dquot_hash(struct dquot *dquot)
{
	struct hlist_head *head;

	head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id);
	hlist_add_head(&dquot->dq_hash, head);
}

static inline void remove_dquot_hash(struct dquot *dquot)
{
	hlist_del_init(&dquot->dq_hash);
}

static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
				struct kqid qid)
{
	struct dquot *dquot;

	hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash)
		if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
			return dquot;

	return NULL;
}

/* Add a dquot to the tail of the free list */
static inline void put_dquot_last(struct dquot *dquot)
{
	list_add_tail(&dquot->dq_free, &free_dquots);
	dqstats_inc(DQST_FREE_DQUOTS);
}

static inline void put_releasing_dquots(struct dquot *dquot)
{
	list_add_tail(&dquot->dq_free, &releasing_dquots);
	set_bit(DQ_RELEASING_B, &dquot->dq_flags);
}

static inline void remove_free_dquot(struct dquot *dquot)
{
	if (list_empty(&dquot->dq_free))
		return;
	list_del_init(&dquot->dq_free);
	if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags))
		dqstats_dec(DQST_FREE_DQUOTS);
	else
		clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
}

static inline void put_inuse(struct dquot *dquot)
{
	/* We add to the back of inuse list so we don't have to restart
	 * when traversing this list and we block */
	list_add_tail(&dquot->dq_inuse, &inuse_list);
	dqstats_inc(DQST_ALLOC_DQUOTS);
}

static inline void remove_inuse(struct dquot *dquot)
{
	dqstats_dec(DQST_ALLOC_DQUOTS);
	list_del(&dquot->dq_inuse);
}

/*
 * End of list functions needing dq_list_lock
 */

static void wait_on_dquot(struct dquot *dquot)
{
	mutex_lock(&dquot->dq_lock);
	mutex_unlock(&dquot->dq_lock);
}

static inline int dquot_active(struct dquot *dquot)
{
	return test_bit(DQ_ACTIVE_B, &dquot->dq_flags);
}

static inline int dquot_dirty(struct dquot *dquot)
{
	return test_bit(DQ_MOD_B, &dquot->dq_flags);
}

static inline int mark_dquot_dirty(struct dquot *dquot)
{
	return dquot->dq_sb->dq_op->mark_dirty(dquot);
}

/* Mark dquot dirty in atomic manner, and return its old dirty flag state */
int dquot_mark_dquot_dirty(struct dquot *dquot)
{
	int ret = 1;

	if (!dquot_active(dquot))
		return 0;

	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
		return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);

	/* If quota is dirty already, we don't have to acquire dq_list_lock */
	if (dquot_dirty(dquot))
		return 1;

	spin_lock(&dq_list_lock);
	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
				info[dquot->dq_id.type].dqi_dirty_list);
		ret = 0;
	}
	spin_unlock(&dq_list_lock);
	return ret;
}
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
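For context, a filesystem without its own journalled quota handling typically points its dquot_operations at the generic helpers in this file. A sketch (the variable name is hypothetical, and a real filesystem may fill in more callbacks, e.g. .write_info):

/* Illustration only: wiring the generic dquot helpers into an ops table. */
static const struct dquot_operations example_dquot_ops = {
	.write_dquot	= dquot_commit,
	.acquire_dquot	= dquot_acquire,
	.release_dquot	= dquot_release,
	.mark_dirty	= dquot_mark_dquot_dirty,
	.alloc_dquot	= dquot_alloc,
	.destroy_dquot	= dquot_destroy,
};

/* Dirtify all the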
dquots - this can block when journalling */ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots) { int ret, err, cnt; struct dquot *dquot; ret = err = 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) /* Even in case of error we have to continue */ ret = mark_dquot_dirty(dquot); if (!err && ret < 0) err = ret; } return err; } static inline void dqput_all(struct dquot **dquot) { unsigned int cnt; for (cnt = 0; cnt < MAXQUOTAS; cnt++) dqput(dquot[cnt]); } static inline int clear_dquot_dirty(struct dquot *dquot) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); spin_lock(&dq_list_lock); if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); return 0; } list_del_init(&dquot->dq_dirty); spin_unlock(&dq_list_lock); return 1; } void mark_info_dirty(struct super_block *sb, int type) { spin_lock(&dq_data_lock); sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY; spin_unlock(&dq_data_lock); } EXPORT_SYMBOL(mark_info_dirty); /* * Read dquot from disk and alloc space for it */ int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) goto out_iolock; } /* Make sure flags update is visible after dquot has been filled */ smp_mb__before_atomic(); set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ if (!dquot_active(dquot) && !dquot->dq_off) { ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret < 0) goto out_iolock; if (ret2 < 0) { ret = ret2; goto out_iolock; } } /* * Make sure flags update is visible after on-disk struct has been * allocated. Paired with smp_rmb() in dqget(). 
 */
	smp_mb__before_atomic();
	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
	memalloc_nofs_restore(memalloc);
	mutex_unlock(&dquot->dq_lock);
	return ret;
}
EXPORT_SYMBOL(dquot_acquire);

/*
 * Write dquot to disk
 */
int dquot_commit(struct dquot *dquot)
{
	int ret = 0;
	unsigned int memalloc;
	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

	mutex_lock(&dquot->dq_lock);
	memalloc = memalloc_nofs_save();
	if (!clear_dquot_dirty(dquot))
		goto out_lock;
	/* A dquot can be inactive only if there was an error during read/init,
	 * so we'd better not write it */
	if (dquot_active(dquot))
		ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
	else
		ret = -EIO;
out_lock:
	memalloc_nofs_restore(memalloc);
	mutex_unlock(&dquot->dq_lock);
	return ret;
}
EXPORT_SYMBOL(dquot_commit);

/*
 * Release dquot
 */
int dquot_release(struct dquot *dquot)
{
	int ret = 0, ret2 = 0;
	unsigned int memalloc;
	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

	mutex_lock(&dquot->dq_lock);
	memalloc = memalloc_nofs_save();
	/* Check whether we are not racing with some other dqget() */
	if (dquot_is_busy(dquot))
		goto out_dqlock;
	if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
		ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
		/* Write the info */
		if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
			ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
						dquot->dq_sb, dquot->dq_id.type);
		}
		if (ret >= 0)
			ret = ret2;
	}
	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_dqlock:
	memalloc_nofs_restore(memalloc);
	mutex_unlock(&dquot->dq_lock);
	return ret;
}
EXPORT_SYMBOL(dquot_release);

void dquot_destroy(struct dquot *dquot)
{
	kmem_cache_free(dquot_cachep, dquot);
}
EXPORT_SYMBOL(dquot_destroy);

static inline void do_destroy_dquot(struct dquot *dquot)
{
	dquot->dq_sb->dq_op->destroy_dquot(dquot);
}

/* Invalidate all dquots on the list. Note that this function is called after
 * quota is disabled and pointers from inodes removed so there cannot be new
 * quota users. There can still be some users of quotas due to inodes being
 * just deleted or pruned by prune_icache() (those are not attached to any
 * list) or a parallel quotactl call. We have to wait for such users.
 */
static void invalidate_dquots(struct super_block *sb, int type)
{
	struct dquot *dquot, *tmp;

restart:
	flush_delayed_work(&quota_release_work);

	spin_lock(&dq_list_lock);
	list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
		if (dquot->dq_sb != sb)
			continue;
		if (dquot->dq_id.type != type)
			continue;
		/* Wait for dquot users */
		if (atomic_read(&dquot->dq_count)) {
			atomic_inc(&dquot->dq_count);
			spin_unlock(&dq_list_lock);
			/*
			 * Once dqput() wakes us up, we know it's time to free
			 * the dquot.
			 * IMPORTANT: we rely on the fact that there is always
			 * at most one process waiting for dquot to free.
			 * Otherwise dq_count would be > 1 and we would never
			 * wake up.
			 */
			wait_event(dquot_ref_wq,
				   atomic_read(&dquot->dq_count) == 1);
			dqput(dquot);
			/* At this moment the dquot need not exist anymore (it
			 * could have been reclaimed by prune_dqcache()).
			 * Hence we must restart. */
			goto restart;
		}
		/*
		 * The last user already dropped its reference but the dquot
		 * didn't get fully cleaned up yet. Restart the scan which
		 * flushes the work cleaning up released dquots.
*/ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); goto restart; } /* * Quota now has no users and it has been written on last * dqput() */ remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); } spin_unlock(&dq_list_lock); } /* Call callback for every active dquot on given filesystem */ int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv) { struct dquot *dquot, *old_dquot = NULL; int ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { if (!dquot_active(dquot)) continue; if (dquot->dq_sb != sb) continue; /* Now we have active dquot so we can just increase use count */ atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqput(old_dquot); old_dquot = dquot; /* * ->release_dquot() can be racing with us. Our reference * protects us from new calls to it so just wait for any * outstanding call and recheck the DQ_ACTIVE_B after that. */ wait_on_dquot(dquot); if (dquot_active(dquot)) { ret = fn(dquot, priv); if (ret < 0) goto out; } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ } spin_unlock(&dq_list_lock); out: dqput(old_dquot); return ret; } EXPORT_SYMBOL(dquot_scan_active); static inline int dquot_write_dquot(struct dquot *dquot) { int ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota structure " "(error %d). Quota may get out of sync!", ret); /* Clear dirty bit anyway to avoid infinite loop. */ clear_dquot_dirty(dquot); } return ret; } /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { struct list_head dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; int err, ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); flush_delayed_work(&quota_release_work); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); /* Move list away to avoid livelock. 
*/ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty); while (!list_empty(&dirty)) { dquot = list_first_entry(&dirty, struct dquot, dq_dirty); WARN_ON(!dquot_active(dquot)); /* If the dquot is releasing we should not touch it */ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); flush_delayed_work(&quota_release_work); spin_lock(&dq_list_lock); continue; } /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); err = dquot_write_dquot(dquot); if (err && !ret) ret = err; dqput(dquot); spin_lock(&dq_list_lock); } spin_unlock(&dq_list_lock); } for (cnt = 0; cnt < MAXQUOTAS; cnt++) if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); dqstats_inc(DQST_SYNCS); return ret; } EXPORT_SYMBOL(dquot_writeback_dquots); /* Write all dquot structures to disk and make them visible from userspace */ int dquot_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret; ret = dquot_writeback_dquots(sb, type); if (ret) return ret; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) return 0; /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ if (sb->s_op->sync_fs) { ret = sb->s_op->sync_fs(sb, 1); if (ret) return ret; } ret = sync_blockdev(sb->s_bdev); if (ret) return ret; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } return 0; } EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); while (!list_empty(&free_dquots) && sc->nr_to_scan) { dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; } spin_unlock(&dq_list_lock); return freed; } static unsigned long dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio( percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } /* * Safely release dquot and put reference to dquot. */ static void quota_release_workfn(struct work_struct *work) { struct dquot *dquot; struct list_head rls_head; spin_lock(&dq_list_lock); /* Exchange the list head to avoid livelock. */ list_replace_init(&releasing_dquots, &rls_head); spin_unlock(&dq_list_lock); synchronize_srcu(&dquot_srcu); restart: spin_lock(&dq_list_lock); while (!list_empty(&rls_head)) { dquot = list_first_entry(&rls_head, struct dquot, dq_free); WARN_ON_ONCE(atomic_read(&dquot->dq_count)); /* * Note that DQ_RELEASING_B protects us from racing with * invalidate_dquots() calls so we are safe to work with the * dquot even after we drop dq_list_lock. 
*/ if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ dquot_write_dquot(dquot); goto restart; } if (dquot_active(dquot)) { spin_unlock(&dq_list_lock); dquot->dq_sb->dq_op->release_dquot(dquot); goto restart; } /* Dquot is inactive and clean, now move it to free list */ remove_free_dquot(dquot); put_dquot_last(dquot); } spin_unlock(&dq_list_lock); } /* * Put reference to dquot */ void dqput(struct dquot *dquot) { if (!dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", quotatypes[dquot->dq_id.type], from_kqid(&init_user_ns, dquot->dq_id)); BUG(); } #endif dqstats_inc(DQST_DROPS); spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } /* Need to release dquot? */ WARN_ON_ONCE(!list_empty(&dquot->dq_free)); put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); queue_delayed_work(system_unbound_wq, &quota_release_work, 1); } EXPORT_SYMBOL(dqput); struct dquot *dquot_alloc(struct super_block *sb, int type) { return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); } EXPORT_SYMBOL(dquot_alloc); static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) return NULL; mutex_init(&dquot->dq_lock); INIT_LIST_HEAD(&dquot->dq_free); INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); spin_lock_init(&dquot->dq_dqb_lock); return dquot; } /* * Get reference to dquot * * Locking is slightly tricky here. We are guarded from parallel quotaoff() * destroying our dquot by: * a) checking for quota flags under dq_list_lock and * b) getting a reference to dquot before we release dq_list_lock */ struct dquot *dqget(struct super_block *sb, struct kqid qid) { unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; if (!qid_has_mapping(sb->s_user_ns, qid)) return ERR_PTR(-EINVAL); if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); dquot = ERR_PTR(-ESRCH); goto out; } spin_unlock(&dq_state_lock); dquot = find_dquot(hashent, sb, qid); if (!dquot) { if (!empty) { spin_unlock(&dq_list_lock); empty = get_empty_dquot(sb, qid.type); if (!empty) schedule(); /* Try to wait for a moment... 
*/ goto we_slept; } dquot = empty; empty = NULL; dquot->dq_id = qid; /* all dquots go on the inuse_list */ put_inuse(dquot); /* hash it first so it can be found */ insert_dquot_hash(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); } else { if (!atomic_read(&dquot->dq_count)) remove_free_dquot(dquot); atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqstats_inc(DQST_CACHE_HITS); dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ if (!dquot_active(dquot)) { int err; err = sb->dq_op->acquire_dquot(dquot); if (err < 0) { dqput(dquot); dquot = ERR_PTR(err); goto out; } } /* * Make sure following reads see filled structure - paired with * smp_mb__before_atomic() in dquot_acquire(). */ smp_rmb(); /* Has somebody invalidated entry under us? */ WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash)); out: if (empty) do_destroy_dquot(empty); return dquot; } EXPORT_SYMBOL(dqget); static inline struct dquot __rcu **i_dquot(struct inode *inode) { return inode->i_sb->s_op->get_dquots(inode); } static int dqinit_needed(struct inode *inode, int type) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return 0; dquots = i_dquot(inode); if (type != -1) return !dquots[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!dquots[cnt]) return 1; return 0; } /* This routine is guarded by s_umount semaphore */ static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif iput(old_inode); err = __dquot_initialize(inode, type); if (err) { iput(inode); goto out; } /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " "thus quota information is probably inconsistent. " "Please run quotacheck(8)"); } #endif return err; } static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking * (dq_data_lock). 
*/ spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { struct dquot __rcu **dquots = i_dquot(inode); struct dquot *dquot = srcu_dereference_check( dquots[type], &dquot_srcu, lockdep_is_held(&dq_data_lock)); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif rcu_assign_pointer(dquots[type], NULL); if (dquot) dqput(dquot); } spin_unlock(&dq_data_lock); } spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" " was disabled thus quota information is probably " "inconsistent. Please run quotacheck(8).\n", sb->s_id); } #endif } /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { if (sb->dq_op) remove_dquot_ref(sb, type); } static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { if (dquot->dq_dqb.dqb_rsvspace >= number) dquot->dq_dqb.dqb_rsvspace -= number; else { WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curinodes >= number) dquot->dq_dqb.dqb_curinodes -= number; else dquot->dq_dqb.dqb_curinodes = 0; if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) dquot->dq_dqb.dqb_itime = (time64_t) 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } static void dquot_decr_space(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curspace >= number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } struct dquot_warn { struct super_block *w_sb; struct kqid w_dq_id; short w_type; }; static int warning_issued(struct dquot *dquot, const int warntype) { int flag = (warntype == QUOTA_NL_BHARDWARN || warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : ((warntype == QUOTA_NL_IHARDWARN || warntype == QUOTA_NL_ISOFTLONGWARN) ? 
DQ_INODES_B : 0); if (!flag) return 0; return test_and_set_bit(flag, &dquot->dq_flags); } #ifdef CONFIG_PRINT_QUOTA_WARNING static int flag_print_warnings = 1; static int need_print_warning(struct dquot_warn *warn) { if (!flag_print_warnings) return 0; switch (warn->w_dq_id.type) { case USRQUOTA: return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: return in_group_p(warn->w_dq_id.gid); case PRJQUOTA: return 1; } return 0; } /* Print warning to user which exceeded quota */ static void print_warning(struct dquot_warn *warn) { char *msg = NULL; struct tty_struct *tty; int warntype = warn->w_type; if (warntype == QUOTA_NL_IHARDBELOW || warntype == QUOTA_NL_ISOFTBELOW || warntype == QUOTA_NL_BHARDBELOW || warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) return; tty = get_current_tty(); if (!tty) return; tty_write_message(tty, warn->w_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); tty_write_message(tty, quotatypes[warn->w_dq_id.type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; break; case QUOTA_NL_ISOFTLONGWARN: msg = " file quota exceeded too long.\r\n"; break; case QUOTA_NL_ISOFTWARN: msg = " file quota exceeded.\r\n"; break; case QUOTA_NL_BHARDWARN: msg = " block limit reached.\r\n"; break; case QUOTA_NL_BSOFTLONGWARN: msg = " block quota exceeded too long.\r\n"; break; case QUOTA_NL_BSOFTWARN: msg = " block quota exceeded.\r\n"; break; } tty_write_message(tty, msg); tty_kref_put(tty); } #endif static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, int warntype) { if (warning_issued(dquot, warntype)) return; warn->w_type = warntype; warn->w_sb = dquot->dq_sb; warn->w_dq_id = dquot->dq_id; } /* * Write warnings to the console and send warning messages over netlink. * * Note that this function can call into tty and networking code. 
*/ static void flush_warnings(struct dquot_warn *warn) { int i; for (i = 0; i < MAXQUOTAS; i++) { if (warn[i].w_type == QUOTA_NL_NOWARN) continue; #ifdef CONFIG_PRINT_QUOTA_WARNING print_warning(&warn[i]); #endif quota_send_warning(warn[i].w_dq_id, warn[i].w_sb->s_dev, warn[i].w_type); } } static int ignore_hardlimit(struct dquot *dquot) { struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & DQF_ROOT_SQUASH)); } static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, struct dquot_warn *warn) { qsize_t newinodes; int ret = 0; spin_lock(&dquot->dq_dqb_lock); newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } add: dquot->dq_dqb.dqb_curinodes = newinodes; out: spin_unlock(&dquot->dq_dqb_lock); return ret; } static int dquot_add_space(struct dquot *dquot, qsize_t space, qsize_t rsv_space, unsigned int flags, struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; int ret = 0; spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ ret = -EDQUOT; goto finish; } } finish: /* * We have to be careful and go through warning generation & grace time * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it * only here... 
*/ if (flags & DQUOT_SPACE_NOFAIL) ret = 0; if (!ret) { dquot->dq_dqb.dqb_rsvspace += rsv_space; dquot->dq_dqb.dqb_curspace += space; } spin_unlock(&dquot->dq_dqb_lock); return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) { qsize_t newinodes; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type)) return QUOTA_NL_NOWARN; newinodes = dquot->dq_dqb.dqb_curinodes - inodes; if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) return QUOTA_NL_ISOFTBELOW; if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && newinodes < dquot->dq_dqb.dqb_ihardlimit) return QUOTA_NL_IHARDBELOW; return QUOTA_NL_NOWARN; } static int info_bdq_free(struct dquot *dquot, qsize_t space) { qsize_t tspace; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; if (tspace >= dquot->dq_dqb.dqb_bhardlimit && tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } static int inode_quota_active(const struct inode *inode) { struct super_block *sb = inode->i_sb; if (IS_NOQUOTA(inode)) return 0; return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* * Initialize quota pointers in inode * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; struct dquot __rcu **dquots; struct dquot *got[MAXQUOTAS] = {}; struct super_block *sb = inode->i_sb; qsize_t rsv; int ret = 0; if (!inode_quota_active(inode)) return 0; dquots = i_dquot(inode); /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; kprojid_t projid; int rc; struct dquot *dquot; if (type != -1 && cnt != type) continue; /* * The i_dquot should have been initialized in most cases, * we check it without locking here to avoid unnecessary * dqget()/dqput() calls. */ if (dquots[cnt]) continue; if (!sb_has_quota_active(sb, cnt)) continue; init_needed = 1; switch (cnt) { case USRQUOTA: qid = make_kqid_uid(inode->i_uid); break; case GRPQUOTA: qid = make_kqid_gid(inode->i_gid); break; case PRJQUOTA: rc = inode->i_sb->dq_op->get_projid(inode, &projid); if (rc) continue; qid = make_kqid_projid(projid); break; } dquot = dqget(sb, qid); if (IS_ERR(dquot)) { /* We raced with somebody turning quotas off... 
 */
			if (PTR_ERR(dquot) != -ESRCH) {
				ret = PTR_ERR(dquot);
				goto out_put;
			}
			dquot = NULL;
		}
		got[cnt] = dquot;
	}

	/* All required i_dquot pointers have been initialized */
	if (!init_needed)
		return 0;

	spin_lock(&dq_data_lock);
	if (IS_NOQUOTA(inode))
		goto out_lock;
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (type != -1 && cnt != type)
			continue;
		/* Avoid races with quotaoff() */
		if (!sb_has_quota_active(sb, cnt))
			continue;
		/* We could race with quotaon or dqget() could have failed */
		if (!got[cnt])
			continue;
		if (!dquots[cnt]) {
			rcu_assign_pointer(dquots[cnt], got[cnt]);
			got[cnt] = NULL;
			/*
			 * Make quota reservation system happy if someone
			 * did a write before quota was turned on
			 */
			rsv = inode_get_rsv_space(inode);
			if (unlikely(rsv)) {
				struct dquot *dquot = srcu_dereference_check(
					dquots[cnt], &dquot_srcu,
					lockdep_is_held(&dq_data_lock));

				spin_lock(&inode->i_lock);
				/* Get reservation again under proper lock */
				rsv = __inode_get_rsv_space(inode);
				spin_lock(&dquot->dq_dqb_lock);
				dquot->dq_dqb.dqb_rsvspace += rsv;
				spin_unlock(&dquot->dq_dqb_lock);
				spin_unlock(&inode->i_lock);
			}
		}
	}
out_lock:
	spin_unlock(&dq_data_lock);
out_put:
	/* Drop unused references */
	dqput_all(got);
	return ret;
}

int dquot_initialize(struct inode *inode)
{
	return __dquot_initialize(inode, -1);
}
EXPORT_SYMBOL(dquot_initialize);

bool dquot_initialize_needed(struct inode *inode)
{
	struct dquot __rcu **dquots;
	int i;

	if (!inode_quota_active(inode))
		return false;

	dquots = i_dquot(inode);
	for (i = 0; i < MAXQUOTAS; i++)
		if (!dquots[i] && sb_has_quota_active(inode->i_sb, i))
			return true;
	return false;
}
EXPORT_SYMBOL(dquot_initialize_needed);

/*
 * Release all quotas referenced by inode.
 *
 * This function may only be called on inode free or when converting a file
 * to a quota file; in both cases there are no other users of the i_dquot,
 * so we need not call synchronize_srcu() after clearing i_dquot.
 */
static void __dquot_drop(struct inode *inode)
{
	int cnt;
	struct dquot __rcu **dquots = i_dquot(inode);
	struct dquot *put[MAXQUOTAS];

	spin_lock(&dq_data_lock);
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
					lockdep_is_held(&dq_data_lock));
		rcu_assign_pointer(dquots[cnt], NULL);
	}
	spin_unlock(&dq_data_lock);
	dqput_all(put);
}

void dquot_drop(struct inode *inode)
{
	struct dquot __rcu * const *dquots;
	int cnt;

	if (IS_NOQUOTA(inode))
		return;

	/*
	 * Test before calling to rule out calls from proc and such
	 * where we are not allowed to block. Note that this is
	 * actually a reliable test even without the lock - the caller
	 * must assure that nobody can come after the DQUOT_DROP and
	 * add quota pointers back anyway.
	 */
	dquots = i_dquot(inode);
	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
		if (dquots[cnt])
			break;
	}

	if (cnt < MAXQUOTAS)
		__dquot_drop(inode);
}
EXPORT_SYMBOL(dquot_drop);
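Filesystems call dquot_initialize() before any operation that will be charged to the inode. A minimal sketch (the function name is hypothetical, illustration only, not part of the original file):

/* Illustration only: attach dquots before a quota-charged operation. */
static int example_prepare_charged_op(struct inode *inode)
{
	int err;

	err = dquot_initialize(inode);	/* may read dquots from disk, can block */
	if (err)
		return err;
	/* ... proceed with the operation that allocates blocks/inodes ... */
	return 0;
}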
*/ static qsize_t *inode_reserved_space(struct inode * inode) { /* Filesystem must explicitly define its own method in order to use * quota reservation interface */ BUG_ON(!inode->i_sb->dq_op->get_reserved_space); return inode->i_sb->dq_op->get_reserved_space(inode); } static qsize_t __inode_get_rsv_space(struct inode *inode) { if (!inode->i_sb->dq_op->get_reserved_space) return 0; return *inode_reserved_space(inode); } static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } /* * This function updates i_blocks+i_bytes fields and quota information * (together with appropriate checks). * * NOTE: We absolutely rely on the fact that the caller dirties the inode * (usually helpers in quotaops.h care about this) and holds a handle for * the current transaction so that dquot write and inode write go into the * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; int reserve = flags & DQUOT_SPACE_RESERVE; struct dquot __rcu **dquots; struct dquot *dquot; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } else { inode_add_bytes(inode, number); } goto out; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; if (reserve) { ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]); } else { ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]); } if (ret) { /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); goto out_flush_warn; } } if (reserve) *inode_reserved_space(inode) += number; else __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; ret = mark_all_dquot_dirty(dquots); out_flush_warn: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); out: return ret; } EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; if (!inode_quota_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; ret = dquot_add_inodes(dquot, 1, &warn[cnt]); if (ret) { for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; /* Back out changes we already did */ spin_lock(&dquot->dq_dqb_lock); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } goto warn_put_all; } } warn_put_all: spin_unlock(&inode->i_lock); if (ret == 0) ret
= mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); /* * Convert in-memory reserved quotas to real consumed quotas */ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) number = dquot->dq_dqb.dqb_rsvspace; dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * Convert allocated space back to in-memory reserved quotas */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Return claimed space back to reserved quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) number = dquot->dq_dqb.dqb_curspace; dquot->dq_dqb.dqb_rsvspace += number; dquot->dq_dqb.dqb_curspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); /* * This operation can block, but only after everything is updated */ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu **dquots; struct dquot *dquot; int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } else { inode_sub_bytes(inode, number); } return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_bdq_free(dquot, number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } if (reserve) *inode_reserved_space(inode) -= number; else __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; mark_all_dquot_dirty(dquots); out_unlock:
srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ void dquot_free_inode(struct inode *inode) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; int index; if (!inode_quota_active(inode)) return; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_idq_free(dquot, 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); /* * Transfer the number of inodes and blocks from one diskquota to another. * On success, dquot references in transfer_to are consumed and references * to original dquots that need to be released are placed there. On failure, * references are kept untouched. * * This operation can block, but only after everything is updated. * A transaction must be started when entering this function. * * We are holding references on transfer_from & transfer_to, no need to * protect them by srcu_read_lock(). */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot __rcu **dquots; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, index, ret = 0, err; char is_valid[MAXQUOTAS] = {}; struct dquot_warn warn_to[MAXQUOTAS]; struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; if (IS_NOQUOTA(inode)) return 0; if (inode->i_sb->dq_op->get_inode_usage) { ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); if (ret) return ret; } /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } spin_lock(&dq_data_lock); spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } cur_space = __inode_get_bytes(inode); rsv_space = __inode_get_rsv_space(inode); dquots = i_dquot(inode); /* * Build the transfer_from list, check limits, and update usage in * the target structures. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for the same uid or gid or for a turned-off quota type.
*/ if (!transfer_to[cnt]) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(inode->i_sb, cnt)) continue; is_valid[cnt] = 1; transfer_from[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); ret = dquot_add_inodes(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, DQUOT_SPACE_WARN, &warn_to[cnt]); if (ret) { spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } /* Decrease usage for source structures and update quota pointers */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!is_valid[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; spin_lock(&transfer_from[cnt]->dq_dqb_lock); wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); wtype = info_bdq_free(transfer_from[cnt], cur_space + rsv_space); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); spin_unlock(&transfer_from[cnt]->dq_dqb_lock); } rcu_assign_pointer(dquots[cnt], transfer_to[cnt]); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); /* * These arrays are local and we hold dquot references so we don't need * the srcu protection but still take dquot_srcu to avoid warning in * mark_all_dquot_dirty(). */ index = srcu_read_lock(&dquot_srcu); err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from); if (err < 0) ret = err; err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to); if (err < 0) ret = err; srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn_to); flush_warnings(warn_from_inodes); flush_warnings(warn_from_space); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) transfer_to[cnt] = transfer_from[cnt]; return ret; over_quota: /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { if (!is_valid[cnt]) continue; spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); dquot_decr_space(transfer_to[cnt], cur_space); dquot_free_reserved_space(transfer_to[cnt], rsv_space); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); flush_warnings(warn_to); return ret; } EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; struct super_block *sb = inode->i_sb; int ret; if (!inode_quota_active(inode)) return 0; if (i_uid_needs_update(idmap, iattr, inode)) { kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } transfer_to[USRQUOTA] = dquot; } if (i_gid_needs_update(idmap, iattr, inode)) { kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; 
} dquot = NULL; } transfer_to[GRPQUOTA] = dquot; } ret = __dquot_transfer(inode, transfer_to); out_put: dqput_all(transfer_to); return ret; } EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk */ int dquot_commit_info(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); return dqopt->ops[type]->write_file_info(sb, type); } EXPORT_SYMBOL(dquot_commit_info); int dquot_get_next_id(struct super_block *sb, struct kqid *qid) { struct quota_info *dqopt = sb_dqopt(sb); if (!sb_has_quota_active(sb, qid->type)) return -ESRCH; if (!dqopt->ops[qid->type]->get_next_id) return -ENOSYS; return dqopt->ops[qid->type]->get_next_id(sb, qid); } EXPORT_SYMBOL(dquot_get_next_id); /* * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, .mark_dirty = dquot_mark_dquot_dirty, .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_next_id = dquot_get_next_id, }; EXPORT_SYMBOL(dquot_operations); /* * Generic helper for ->open on filesystems supporting disk quotas. */ int dquot_file_open(struct inode *inode, struct file *file) { int error; error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) error = dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); static void vfs_cleanup_quota_inode(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode = dqopt->files[type]; if (!inode) return; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { inode_lock(inode); inode->i_flags &= ~S_NOQUOTA; inode_unlock(inode); } dqopt->files[type] = NULL; iput(inode); } /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { int cnt; struct quota_info *dqopt = sb_dqopt(sb); rwsem_assert_held_write(&sb->s_umount); /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | DQUOT_USAGE_ENABLED))) return -EINVAL; /* * Skip everything if there's nothing to do. We have to do this because * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ if (!sb_any_quota_loaded(sb)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) continue; if (flags & DQUOT_SUSPENDED) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); } else { spin_lock(&dq_state_lock); dqopt->flags &= ~dquot_state_flag(flags, cnt); /* Turning off suspended quotas? */ if (!sb_has_quota_loaded(sb, cnt) && sb_has_quota_suspended(sb, cnt)) { dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); } /* We still have to keep quota loaded? */ if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) continue; /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); /* * Now all dquots should be invalidated, all writes done so we * should be only users of the info. No locks needed. 
*/ if (info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } /* Skip syncing and setting flags if quota files are hidden */ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) goto put_inodes; /* Sync the superblock so that buffers with quota data are written to * disk (and so userspace sees correct data afterwards). */ if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); sync_blockdev(sb->s_bdev); /* Now the quota files are just ordinary files and we can set the * inode flags back. Moreover we discard the pagecache so that * userspace sees the writes we did bypassing the pagecache. We * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: /* We are done when suspending quotas */ if (flags & DQUOT_SUSPENDED) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt)) vfs_cleanup_quota_inode(sb, cnt); return 0; } EXPORT_SYMBOL(dquot_disable); int dquot_quota_off(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); } EXPORT_SYMBOL(dquot_quota_off); /* * Turn quotas on for a device */ static int vfs_setup_quota_inode(struct inode *inode, int type) { struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); if (is_bad_inode(inode)) return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) return -EROFS; if (sb_has_quota_loaded(sb, type)) return -EBUSY; /* * Quota files should never be encrypted. They should be thought of as * filesystem metadata, not user data. New-style internal quota files * cannot be encrypted by users anyway, but old-style external quota * files could potentially be incorrectly created in an encrypted * directory, hence this explicit check. Some reasons why encrypted * quota files don't work include: (1) some filesystems that support * encryption don't handle it in their quota_read and quota_write, and * (2) cleaning up encrypted quota files at unmount would need special * consideration, as quota files are cleaned up later than user files. */ if (IS_ENCRYPTED(inode)) return -EINVAL; dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) return -EIO; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* We don't want quota and atime on quota files (deadlocks * possible). Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added */ __dquot_drop(inode); } return 0; } int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt; struct quota_info *dqopt = sb_dqopt(sb); int error; lockdep_assert_held_write(&sb->s_umount); /* Just unsuspend quotas?
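* No: resuming suspended quotas must go through dquot_resume() instead,
* so passing DQUOT_SUSPENDED here is a caller bug.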
*/ if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED)) return -EINVAL; fmt = find_quota_format(format_id); if (!fmt) return -ESRCH; if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; } /* Filesystems outside of init_user_ns not yet supported */ if (sb->s_user_ns != &init_user_ns) { error = -EINVAL; goto out_fmt; } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; goto out_fmt; } if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_fmt; } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* As we bypass the pagecache we must now flush all the * dirty data and invalidate caches so that kernel sees * changes from userspace. It is not enough to just flush * the quota file since if blocksize < pagesize, invalidation * of the cache could fail because of other unrelated dirty * data */ sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; dqopt->info[type].dqi_fmt_id = format_id; INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; spin_unlock(&dq_data_lock); } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); error = add_dquot_ref(sb, type); if (error) dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; out_fmt: put_quota_format(fmt); return error; } EXPORT_SYMBOL(dquot_load_quota_sb); /* * More powerful function for turning on quotas on given quota inode allowing * setting of individual quota flags */ int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags) { int err; err = vfs_setup_quota_inode(inode, type); if (err < 0) return err; err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); if (err < 0) vfs_cleanup_quota_inode(inode->i_sb, type); return err; } EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int ret = 0, cnt; unsigned int flags; rwsem_assert_held_write(&sb->s_umount); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_suspended(sb, cnt)) continue; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, cnt); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, flags); if (ret < 0) vfs_cleanup_quota_inode(sb, cnt); } return ret; } EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int error = security_quota_on(path->dentry); if (error) return error; /* Quota file not on the same filesystem? */ if (path->dentry->d_sb != sb) error = -EXDEV; else error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; } EXPORT_SYMBOL(dquot_quota_on); /* * This function is used when filesystem needs to initialize quotas * during mount time. 
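* The quota file is looked up by name relative to the filesystem root,
* because at this point the filesystem need not be reachable through any
* mount path yet.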
*/ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { struct dentry *dentry; int error; dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_quota_on(dentry); if (!error) error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); dput(dentry); return error; } EXPORT_SYMBOL(dquot_quota_on_mount); static int dquot_quota_enable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* Accounting cannot be turned on while fs is mounted */ flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); if (!flags) return -EINVAL; for (type = 0; type < MAXQUOTAS; type++) { if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ if (!sb_has_quota_usage_enabled(sb, type)) { ret = -EINVAL; goto out_err; } if (sb_has_quota_limits_enabled(sb, type)) { /* compatible with XFS */ ret = -EEXIST; goto out_err; } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } return 0; out_err: /* Backout enforcement enablement we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } return ret; } static int dquot_quota_disable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* * We don't support turning off accounting via quotactl. In principle * quota infrastructure can do this but filesystems don't expect * userspace to be able to do it. */ if (flags & (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) return -EOPNOTSUPP; /* Filter out limits not enabled */ for (type = 0; type < MAXQUOTAS; type++) if (!sb_has_quota_limits_enabled(sb, type)) flags &= ~qtype_enforce_flag(type); /* Nothing left? 
*/ if (!flags) return -EEXIST; for (type = 0; type < MAXQUOTAS; type++) { if (flags & qtype_enforce_flag(type)) { ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); if (ret < 0) goto out_err; } } return 0; out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } } return ret; } /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_dqblk); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid, struct qc_dqblk *di) { struct dquot *dquot; int err; if (!sb->dq_op->get_next_id) return -ENOSYS; err = sb->dq_op->get_next_id(sb, qid); if (err < 0) return err; dquot = dqget(sb, *qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_next_dqblk); #define VFS_QC_MASK \ (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; int ret; if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; if (((di->d_fieldmask & QC_SPC_SOFT) && di->d_spc_softlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_SPC_HARD) && di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_INO_SOFT) && (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || ((di->d_fieldmask & QC_INO_HARD) && (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_SOFT) dm->dqb_bsoftlimit = di->d_spc_softlimit; if (di->d_fieldmask & QC_SPC_HARD) dm->dqb_bhardlimit = di->d_spc_hardlimit; if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_COUNT) { dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_TIMER) { dm->dqb_btime = di->d_spc_timer; check_blim = 1; 
set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_TIMER) { dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } if (check_blim) { if (!dm->dqb_bsoftlimit || dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace; } if (check_ilim) { if (!dm->dqb_isoftlimit || dm->dqb_curinodes <= dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace; } if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); ret = mark_dquot_dirty(dquot); if (ret < 0) return ret; return 0; } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; int rc; dquot = dqget(sb, qid); if (IS_ERR(dquot)) { rc = PTR_ERR(dquot); goto out; } rc = do_set_dqblk(dquot, di); dqput(dquot); out: return rc; } EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common part of quota file information */ int dquot_get_state(struct super_block *sb, struct qc_state *state) { struct mem_dqinfo *mi; struct qc_type_state *tstate; struct quota_info *dqopt = sb_dqopt(sb); int type; memset(state, 0, sizeof(*state)); for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_active(sb, type)) continue; tstate = state->s_state + type; mi = sb_dqopt(sb)->info + type; tstate->flags = QCI_ACCT_ENABLED; spin_lock(&dq_data_lock); if (mi->dqi_flags & DQF_SYS_FILE) tstate->flags |= QCI_SYSFILE; if (mi->dqi_flags & DQF_ROOT_SQUASH) tstate->flags |= QCI_ROOT_SQUASH; if (sb_has_quota_limits_enabled(sb, type)) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; if (dqopt->files[type]) { tstate->ino = dqopt->files[type]->i_ino; tstate->blocks = dqopt->files[type]->i_blocks; } tstate->nextents = 1; /* We don't know... 
*/ spin_unlock(&dq_data_lock); } return 0; } EXPORT_SYMBOL(dquot_get_state); /* Generic routine for setting common part of quota file information */ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) { struct mem_dqinfo *mi; if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) return -EINVAL; if (!sb_has_quota_active(sb, type)) return -ESRCH; mi = sb_dqopt(sb)->info + type; if (ii->i_fieldmask & QC_FLAGS) { if ((ii->i_flags & QCI_ROOT_SQUASH && mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) return -EINVAL; } spin_lock(&dq_data_lock); if (ii->i_fieldmask & QC_SPC_TIMER) mi->dqi_bgrace = ii->i_spc_timelimit; if (ii->i_fieldmask & QC_INO_TIMER) mi->dqi_igrace = ii->i_ino_timelimit; if (ii->i_fieldmask & QC_FLAGS) { if (ii->i_flags & QCI_ROOT_SQUASH) mi->dqi_flags |= DQF_ROOT_SQUASH; else mi->dqi_flags &= ~DQF_ROOT_SQUASH; } spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ return sb->dq_op->write_info(sb, type); } EXPORT_SYMBOL(dquot_set_dqinfo); const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .get_nextdqblk = dquot_get_next_dqblk, .set_dqblk = dquot_set_dqblk }; EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); /* Filter negative values for non-monotonic counters */ if (value < 0 && (type == DQST_ALLOC_DQUOTS || type == DQST_FREE_DQUOTS)) value = 0; /* Update global table */ dqstats.stat[type] = value; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "drops", .data = &dqstats.stat[DQST_DROPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "reads", .data = &dqstats.stat[DQST_READS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "writes", .data = &dqstats.stat[DQST_WRITES], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", .data = &dqstats.stat[DQST_CACHE_HITS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", .data = &dqstats.stat[DQST_ALLOC_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", .data = &dqstats.stat[DQST_FREE_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "syncs", .data = &dqstats.stat[DQST_SYNCS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, #ifdef CONFIG_PRINT_QUOTA_WARNING { .procname = "warnings", .data = &flag_print_warnings, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); register_sysctl_init("fs/quota", fs_dqstats_table); dquot_cachep = 
kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_PANIC), NULL); order = 0; dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order); if (!dquot_hash) panic("Cannot create dquot hash table"); ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL, _DQST_DQSTAT_LAST); if (ret) panic("Cannot create dquot stat counters"); /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); dq_hash_bits = ilog2(nr_hash); nr_hash = 1UL << dq_hash_bits; dq_hash_mask = nr_hash - 1; for (i = 0; i < nr_hash; i++) INIT_HLIST_HEAD(dquot_hash + i); pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); if (!dqcache_shrinker) panic("Cannot allocate dquot shrinker"); dqcache_shrinker->count_objects = dqcache_shrink_count; dqcache_shrinker->scan_objects = dqcache_shrink_scan; shrinker_register(dqcache_shrinker); return 0; } fs_initcall(dquot_init);
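/*
 * Editor's note: a minimal sketch (not part of the original file) of the
 * reserve-then-claim cycle implemented by __dquot_alloc_space() and
 * dquot_claim_space_nodirty() above. The "examplefs" helpers are
 * hypothetical; real filesystems normally use the dquot_reserve_block()/
 * dquot_claim_block() wrappers from <linux/quotaops.h> instead of calling
 * these primitives directly.
 */
static int examplefs_begin_delalloc(struct inode *inode, qsize_t bytes)
{
	int ret;

	/* Make sure quota pointers are attached before accounting */
	ret = dquot_initialize(inode);
	if (ret)
		return ret;

	/* Check limits and account the bytes as reserved, not yet used */
	return __dquot_alloc_space(inode, bytes,
				   DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE);
}

static void examplefs_end_delalloc(struct inode *inode, qsize_t bytes,
				   bool blocks_allocated)
{
	if (blocks_allocated)
		/* Turn the reservation into real usage (rsvspace -> curspace) */
		dquot_claim_space_nodirty(inode, bytes);
	else
		/* Writeback failed or was abandoned: drop the reservation */
		__dquot_free_space(inode, bytes, DQUOT_SPACE_RESERVE);
}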
/* SPDX-License-Identifier: GPL-2.0 */ /* * Statically sized hash table implementation * (C) 2012 Sasha Levin <levinsasha928@gmail.com> */ #ifndef _LINUX_HASHTABLE_H #define _LINUX_HASHTABLE_H #include <linux/list.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/rculist.h> #define DEFINE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] __read_mostly = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] #define HASH_SIZE(name) (ARRAY_SIZE(name)) #define HASH_BITS(name) ilog2(HASH_SIZE(name)) /* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ #define hash_min(val, bits) \ (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) static inline void __hash_init(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) INIT_HLIST_HEAD(&ht[i]); } /** * hash_init - initialize a hash table * @hashtable: hashtable to be initialized * * Calculates the size of the hashtable from the given parameter, otherwise * same as hash_init_size. * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) /** * hash_add - add an object to a hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add(hashtable, node, key) \ hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_add_rcu - add an object to a rcu enabled hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add_rcu(hashtable, node, key) \ hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_hashed - check whether an object is in any hashtable * @node: the &struct hlist_node of the object to be checked */ static inline bool hash_hashed(struct hlist_node *node) { return !hlist_unhashed(node); } static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) if (!hlist_empty(&ht[i])) return false; return true; } /** * hash_empty - check whether a hashtable is empty * @hashtable: hashtable to check * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing.
*/ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_del_rcu - remove an object from an rcu enabled hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del_rcu(struct hlist_node *node) { hlist_del_init_rcu(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_rcu - iterate over an rcu enabled hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_rcu(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_rcu(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct hlist_node used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_rcu - iterate over all possible objects hashing to the * same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_rcu(name, obj, member, key, cond...) \ hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ member, ## cond) /** * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing * to the same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over * * This is the same as hash_for_each_possible_rcu() except that it does * not do any RCU debugging or tracing.
*/ #define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ hlist_for_each_entry_rcu_notrace(obj, \ &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @tmp: a &struct hlist_node used for temporary storage * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_safe(name, obj, tmp, member, key) \ hlist_for_each_entry_safe(obj, tmp,\ &name[hash_min(key, HASH_BITS(name))], member) #endif
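/*
 * Editor's note: a short usage sketch (not part of the original header). All
 * identifiers below are hypothetical; the macros come from this file.
 */
struct session {
	u32 id;				/* hash key */
	struct hlist_node node;		/* linkage into one bucket */
};

static DEFINE_HASHTABLE(session_table, 6);	/* 1 << 6 = 64 buckets */

static void session_insert(struct session *s)
{
	hash_add(session_table, &s->node, s->id);
}

static struct session *session_lookup(u32 id)
{
	struct session *s;

	/* Scans only the bucket that "id" hashes to; keys must still be
	 * re-checked because different ids may share a bucket. */
	hash_for_each_possible(session_table, s, node, id)
		if (s->id == id)
			return s;
	return NULL;
}

static void session_remove(struct session *s)
{
	hash_del(&s->node);
}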
/* SPDX-License-Identifier: GPL-2.0 */ /* * Common values and helper functions for the ChaCha and XChaCha stream ciphers. * * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's * security. Here they share the same key size, tfm context, and setkey * function; only their IV size and encrypt/decrypt function differ. * * The ChaCha paper specifies 20, 12, and 8-round variants. In general, it is * recommended to use the 20-round variant ChaCha20. However, the other * variants can be needed in some performance-sensitive scenarios. The generic * ChaCha code currently allows only the 20 and 12-round variants. */ #ifndef _CRYPTO_CHACHA_H #define _CRYPTO_CHACHA_H #include <linux/unaligned.h> #include <linux/types.h> /* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */ #define CHACHA_IV_SIZE 16 #define CHACHA_KEY_SIZE 32 #define CHACHA_BLOCK_SIZE 64 #define CHACHAPOLY_IV_SIZE 12 #define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32)) /* 192-bit nonce, then 64-bit stream position */ #define XCHACHA_IV_SIZE 32 void chacha_block_generic(u32 *state, u8 *stream, int nrounds); static inline void chacha20_block(u32 *state, u8 *stream) { chacha_block_generic(state, stream, 20); } void hchacha_block_arch(const u32 *state, u32 *out, int nrounds); void hchacha_block_generic(const u32 *state, u32 *out, int nrounds); static inline void hchacha_block(const u32 *state, u32 *out, int nrounds) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) hchacha_block_arch(state, out, nrounds); else hchacha_block_generic(state, out, nrounds); } enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, CHACHA_CONSTANT_ND_3 = 0x3320646eU, CHACHA_CONSTANT_2_BY = 0x79622d32U, CHACHA_CONSTANT_TE_K = 0x6b206574U }; static inline void chacha_init_consts(u32 *state) { state[0] = CHACHA_CONSTANT_EXPA; state[1] = CHACHA_CONSTANT_ND_3; state[2] = CHACHA_CONSTANT_2_BY; state[3] = CHACHA_CONSTANT_TE_K; } static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv) { chacha_init_consts(state); state[4] = key[0]; state[5] = key[1]; state[6] = key[2]; state[7] = key[3]; state[8] = key[4]; state[9] = key[5]; state[10] = key[6]; state[11] = key[7]; state[12] = get_unaligned_le32(iv + 0); state[13] = get_unaligned_le32(iv + 4); state[14] = get_unaligned_le32(iv + 8); state[15] = get_unaligned_le32(iv + 12); } void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds); static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) chacha_crypt_arch(state, dst, src, bytes, nrounds); else chacha_crypt_generic(state, dst, src, bytes, nrounds); } static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { chacha_crypt(state, dst, src, bytes, 20); } #endif /* _CRYPTO_CHACHA_H */
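/*
 * Editor's note: a usage sketch (not part of the original header). Key and
 * IV are supplied by the caller; chacha_init() takes the 256-bit key as
 * eight 32-bit words and the 16-byte IV as a 32-bit block counter followed
 * by the 96-bit nonce (the RFC 7539 layout noted above). Decryption is the
 * same operation, since ChaCha simply XORs the keystream with the data.
 */
static void chacha20_crypt_buf(const u32 key[CHACHA_KEY_SIZE / sizeof(u32)],
			       const u8 iv[CHACHA_IV_SIZE],
			       u8 *dst, const u8 *src, unsigned int bytes)
{
	u32 state[CHACHA_STATE_WORDS];

	chacha_init(state, key, iv);		/* constants + key + counter/nonce */
	chacha20_crypt(state, dst, src, bytes);	/* XOR data with the keystream */
}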
/* * llc_station.c - station component of LLC * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_conn.h> #include <net/llc_c_ac.h> #include <net/llc_s_ac.h> #include <net/llc_c_ev.h> #include <net/llc_c_st.h> #include <net/llc_s_ev.h> #include <net/llc_s_st.h> #include <net/llc_pdu.h> static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID && !pdu->dsap; /* NULL DSAP value */ } static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && /* command PDU */ LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST && !pdu->dsap; /* NULL DSAP */ } static int llc_station_ac_send_xid_r(struct sk_buff *skb) { u8 mac_da[ETH_ALEN], dsap; int rc = 1; struct sk_buff *nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, sizeof(struct llc_xid_info)); if (!nskb) goto out; llc_pdu_decode_sa(skb, mac_da); llc_pdu_decode_ssap(skb, &dsap); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127); rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da); if (unlikely(rc)) goto free; dev_queue_xmit(nskb); out: return rc; free: kfree_skb(nskb); goto out; } static int llc_station_ac_send_test_r(struct sk_buff *skb) { u8 mac_da[ETH_ALEN], dsap; int rc = 1; u32 data_size; struct sk_buff *nskb; if (skb->mac_len < ETH_HLEN) goto out; /* The test request command is type U (llc_len = 3) */ data_size = ntohs(eth_hdr(skb)->h_proto) - 3; nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); if (!nskb) goto out; llc_pdu_decode_sa(skb, mac_da); llc_pdu_decode_ssap(skb, &dsap); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); llc_pdu_init_as_test_rsp(nskb, skb); rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da); if (unlikely(rc)) goto free; dev_queue_xmit(nskb); out: return rc; free: kfree_skb(nskb); goto out; } /** * llc_station_rcv - send received pdu to the station state machine * @skb: received frame. * * Sends data unit to station state machine.
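* The skb is always consumed: any XID or TEST reply is sent on a freshly
* allocated frame and the original buffer is freed before returning.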
*/ static void llc_station_rcv(struct sk_buff *skb) { if (llc_stat_ev_rx_null_dsap_xid_c(skb)) llc_station_ac_send_xid_r(skb); else if (llc_stat_ev_rx_null_dsap_test_c(skb)) llc_station_ac_send_test_r(skb); kfree_skb(skb); } void __init llc_station_init(void) { llc_set_station_handler(llc_station_rcv); } void llc_station_exit(void) { llc_set_station_handler(NULL); }
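/*
 * Editor's note: an illustrative, userspace-style predicate (not part of the
 * original file) showing the byte-level test the LLC_PDU_* macros above
 * perform. An 802.2 U-format header is three bytes: DSAP, SSAP, control.
 * The constants follow IEEE 802.2 (XID = 0xaf, TEST = 0xe3, P/F bit = 0x10);
 * the helper name is hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static bool is_null_dsap_u_cmd(const uint8_t llc_hdr[3], uint8_t u_cmd)
{
	bool is_cmd = !(llc_hdr[1] & 0x01);		/* SSAP LSB clear => command */
	bool is_u = (llc_hdr[2] & 0x03) == 0x03;	/* low bits 11 => U format */

	return llc_hdr[0] == 0x00 &&			/* NULL DSAP */
	       is_cmd && is_u &&
	       (llc_hdr[2] & ~0x10) == u_cmd;		/* ignore the P/F bit */
}
/* is_null_dsap_u_cmd(hdr, 0xaf) mirrors llc_stat_ev_rx_null_dsap_xid_c() and
 * is_null_dsap_u_cmd(hdr, 0xe3) mirrors llc_stat_ev_rx_null_dsap_test_c(). */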
// SPDX-License-Identifier: GPL-2.0 /* * device_cgroup.c - device cgroup subsystem * * Copyright 2007 IBM Corp */ #include
<linux/bpf-cgroup.h> #include <linux/device_cgroup.h> #include <linux/cgroup.h> #include <linux/ctype.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/mutex.h> #ifdef CONFIG_CGROUP_DEVICE static DEFINE_MUTEX(devcgroup_mutex); enum devcg_behavior { DEVCG_DEFAULT_NONE, DEVCG_DEFAULT_ALLOW, DEVCG_DEFAULT_DENY, }; /* * exception list locking rules: * hold devcgroup_mutex for update/read. * hold rcu_read_lock() for read. */ struct dev_exception_item { u32 major, minor; short type; short access; struct list_head list; struct rcu_head rcu; }; struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; enum devcg_behavior behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_cgrp_id)); } /* * called under devcgroup_mutex */ static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp, *new; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry(ex, orig, list) { new = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!new) goto free_and_exit; list_add_tail(&new->list, dest); } return 0; free_and_exit: list_for_each_entry_safe(ex, tmp, dest, list) { list_del(&ex->list); kfree(ex); } return -ENOMEM; } static void dev_exceptions_move(struct list_head *dest, struct list_head *orig) { struct dev_exception_item *ex, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(ex, tmp, orig, list) { list_move_tail(&ex->list, dest); } } /* * called under devcgroup_mutex */ static int dev_exception_add(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *excopy, *walk; lockdep_assert_held(&devcgroup_mutex); excopy = kmemdup(ex, sizeof(*ex), GFP_KERNEL); if (!excopy) return -ENOMEM; list_for_each_entry(walk, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access |= ex->access; kfree(excopy); excopy = NULL; } if (excopy != NULL) list_add_tail_rcu(&excopy->list, &dev_cgroup->exceptions); return 0; } /* * called under devcgroup_mutex */ static void dev_exception_rm(struct dev_cgroup *dev_cgroup, struct dev_exception_item *ex) { struct dev_exception_item *walk, *tmp; lockdep_assert_held(&devcgroup_mutex); list_for_each_entry_safe(walk, tmp, &dev_cgroup->exceptions, list) { if (walk->type != ex->type) continue; if (walk->major != ex->major) continue; if (walk->minor != ex->minor) continue; walk->access &= ~ex->access; if (!walk->access) { list_del_rcu(&walk->list); kfree_rcu(walk, rcu); } } } static void __dev_exception_clean(struct dev_cgroup *dev_cgroup) { struct dev_exception_item *ex, *tmp; list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) { list_del_rcu(&ex->list); kfree_rcu(ex, rcu); } } /** * dev_exception_clean - frees all entries of the exception list * @dev_cgroup: dev_cgroup with the exception list to be cleaned * * called under devcgroup_mutex */ static void dev_exception_clean(struct dev_cgroup *dev_cgroup) { lockdep_assert_held(&devcgroup_mutex); __dev_exception_clean(dev_cgroup); } static inline bool is_devcg_online(const struct dev_cgroup *devcg) { return (devcg->behavior != DEVCG_DEFAULT_NONE); } /** * devcgroup_online - initializes 
/**
 * devcgroup_online - initializes devcgroup's behavior and exceptions based on
 *                    parent's
 * @css: css getting online
 * returns 0 in case of success, error code otherwise
 */
static int devcgroup_online(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);
        struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css->parent);
        int ret = 0;

        mutex_lock(&devcgroup_mutex);

        if (parent_dev_cgroup == NULL)
                dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW;
        else {
                ret = dev_exceptions_copy(&dev_cgroup->exceptions,
                                          &parent_dev_cgroup->exceptions);
                if (!ret)
                        dev_cgroup->behavior = parent_dev_cgroup->behavior;
        }
        mutex_unlock(&devcgroup_mutex);

        return ret;
}

static void devcgroup_offline(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);

        mutex_lock(&devcgroup_mutex);
        dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
        mutex_unlock(&devcgroup_mutex);
}

/*
 * called from kernel/cgroup/cgroup.c with cgroup_lock() held.
 */
static struct cgroup_subsys_state *
devcgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct dev_cgroup *dev_cgroup;

        dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
        if (!dev_cgroup)
                return ERR_PTR(-ENOMEM);
        INIT_LIST_HEAD(&dev_cgroup->exceptions);
        dev_cgroup->behavior = DEVCG_DEFAULT_NONE;

        return &dev_cgroup->css;
}

static void devcgroup_css_free(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *dev_cgroup = css_to_devcgroup(css);

        __dev_exception_clean(dev_cgroup);
        kfree(dev_cgroup);
}

#define DEVCG_ALLOW 1
#define DEVCG_DENY 2
#define DEVCG_LIST 3

#define MAJMINLEN 13
#define ACCLEN 4

static void set_access(char *acc, short access)
{
        int idx = 0;

        memset(acc, 0, ACCLEN);
        if (access & DEVCG_ACC_READ)
                acc[idx++] = 'r';
        if (access & DEVCG_ACC_WRITE)
                acc[idx++] = 'w';
        if (access & DEVCG_ACC_MKNOD)
                acc[idx++] = 'm';
}

static char type_to_char(short type)
{
        if (type == DEVCG_DEV_ALL)
                return 'a';
        if (type == DEVCG_DEV_CHAR)
                return 'c';
        if (type == DEVCG_DEV_BLOCK)
                return 'b';
        return 'X';
}

static void set_majmin(char *str, unsigned m)
{
        if (m == ~0)
                strcpy(str, "*");
        else
                sprintf(str, "%u", m);
}

static int devcgroup_seq_show(struct seq_file *m, void *v)
{
        struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m));
        struct dev_exception_item *ex;
        char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];

        rcu_read_lock();
        /*
         * To preserve compatibility:
         * - Only show the "all devices" entry when the default policy is
         *   to allow
         * - List the exceptions in case the default policy is to deny
         * This way, the file remains as a "whitelist of devices"
         */
        if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
                set_access(acc, DEVCG_ACC_MASK);
                set_majmin(maj, ~0);
                set_majmin(min, ~0);
                seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL),
                           maj, min, acc);
        } else {
                list_for_each_entry_rcu(ex, &devcgroup->exceptions, list) {
                        set_access(acc, ex->access);
                        set_majmin(maj, ex->major);
                        set_majmin(min, ex->minor);
                        seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type),
                                   maj, min, acc);
                }
        }
        rcu_read_unlock();

        return 0;
}
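/*
 * Example (sketch, not part of this file): formatting one exception by
 * hand with the helpers above yields the "TYPE MAJOR:MINOR ACCESS" lines
 * seen in devices.list; example_show_one() is hypothetical.
 */
static void example_show_one(struct seq_file *m,
                             const struct dev_exception_item *ex)
{
        char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];

        set_access(acc, ex->access);    /* e.g. "rw"          */
        set_majmin(maj, ex->major);     /* "1", or "*" for ~0 */
        set_majmin(min, ex->minor);     /* "3", or "*" for ~0 */
        /* prints e.g. "c 1:3 rw" */
        seq_printf(m, "%c %s:%s %s\n", type_to_char(ex->type), maj, min, acc);
}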
/**
 * match_exception - iterates the exception list trying to find a complete match
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * It is considered a complete match if an exception is found that will
 * contain the entire range of provided parameters.
 *
 * Return: true in case it matches an exception completely
 */
static bool match_exception(struct list_head *exceptions, short type,
                            u32 major, u32 minor, short access)
{
        struct dev_exception_item *ex;

        list_for_each_entry_rcu(ex, exceptions, list) {
                if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
                        continue;
                if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
                        continue;
                if (ex->major != ~0 && ex->major != major)
                        continue;
                if (ex->minor != ~0 && ex->minor != minor)
                        continue;

                /* provided access cannot have more than the exception rule */
                if (access & (~ex->access))
                        continue;
                return true;
        }
        return false;
}

/**
 * match_exception_partial - iterates the exception list trying to find a partial match
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * It is considered a partial match if an exception's range is found to
 * contain *any* of the devices specified by the provided parameters. This is
 * used to make sure no extra access is being granted that is forbidden by
 * any exception in the list.
 *
 * Return: true in case the provided range overlaps an exception at least
 * partially
 */
static bool match_exception_partial(struct list_head *exceptions, short type,
                                    u32 major, u32 minor, short access)
{
        struct dev_exception_item *ex;

        list_for_each_entry_rcu(ex, exceptions, list,
                                lockdep_is_held(&devcgroup_mutex)) {
                if ((type & DEVCG_DEV_BLOCK) && !(ex->type & DEVCG_DEV_BLOCK))
                        continue;
                if ((type & DEVCG_DEV_CHAR) && !(ex->type & DEVCG_DEV_CHAR))
                        continue;
                /*
                 * We must be sure that both the exception and the provided
                 * range aren't masking all devices
                 */
                if (ex->major != ~0 && major != ~0 && ex->major != major)
                        continue;
                if (ex->minor != ~0 && minor != ~0 && ex->minor != minor)
                        continue;
                /*
                 * In order to make sure the provided range isn't matching
                 * an exception, all its access bits shouldn't match the
                 * exception's access bits
                 */
                if (!(access & ex->access))
                        continue;
                return true;
        }
        return false;
}
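/*
 * Worked example (sketch, not part of this file): with a stored exception
 * "b 8:* r" (type DEVCG_DEV_BLOCK, major 8, minor ~0, access
 * DEVCG_ACC_READ), the two matchers above disagree exactly on
 * overlapping requests:
 *
 *      request (type, major, minor, access)    complete        partial
 *      (BLOCK,  8,  0, READ)                   true            true
 *      (BLOCK,  8,  0, READ|WRITE)             false           true
 *      (BLOCK, ~0, ~0, WRITE)                  false           false
 */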
/**
 * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 * @behavior: behavior of the exception's dev_cgroup
 *
 * This is used to make sure a child cgroup won't have more privileges
 * than its parent
 */
static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
                          struct dev_exception_item *refex,
                          enum devcg_behavior behavior)
{
        bool match = false;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
                         !lockdep_is_held(&devcgroup_mutex),
                         "device_cgroup:verify_new_ex called without proper synchronization");

        if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
                if (behavior == DEVCG_DEFAULT_ALLOW) {
                        /*
                         * new exception in the child doesn't matter, only
                         * adding extra restrictions
                         */
                        return true;
                } else {
                        /*
                         * new exception in the child will add more devices
                         * that can be accessed, so it can't match any of
                         * parent's exceptions, even slightly
                         */
                        match = match_exception_partial(&dev_cgroup->exceptions,
                                                        refex->type,
                                                        refex->major,
                                                        refex->minor,
                                                        refex->access);
                        if (match)
                                return false;
                        return true;
                }
        } else {
                /*
                 * Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore
                 * the new exception will add access to more devices and must
                 * be contained completely in a parent's exception to be
                 * allowed
                 */
                match = match_exception(&dev_cgroup->exceptions, refex->type,
                                        refex->major, refex->minor,
                                        refex->access);

                if (match)
                        /* parent has an exception that matches the proposed */
                        return true;
                else
                        return false;
        }
        return false;
}

/*
 * parent_has_perm:
 * when adding a new allow rule to a device exception list, the rule
 * must be allowed in the parent device cgroup
 */
static int parent_has_perm(struct dev_cgroup *childcg,
                           struct dev_exception_item *ex)
{
        struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);

        if (!parent)
                return 1;
        return verify_new_ex(parent, ex, childcg->behavior);
}

/**
 * parent_allows_removal - verify if it's ok to remove an exception
 * @childcg: child cgroup from where the exception will be removed
 * @ex: exception being removed
 *
 * When removing an exception in cgroups with default ALLOW policy, it must
 * be checked if removing it will give the child cgroup more access than the
 * parent.
 *
 * Return: true if it's ok to remove exception, false otherwise
 */
static bool parent_allows_removal(struct dev_cgroup *childcg,
                                  struct dev_exception_item *ex)
{
        struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);

        if (!parent)
                return true;

        /* It's always allowed to remove access to devices */
        if (childcg->behavior == DEVCG_DEFAULT_DENY)
                return true;

        /*
         * Make sure you're not removing part or a whole exception existing in
         * the parent cgroup
         */
        return !match_exception_partial(&parent->exceptions, ex->type,
                                        ex->major, ex->minor, ex->access);
}

/**
 * may_allow_all - checks if it's possible to change the behavior to
 *                 allow based on parent's rules.
 * @parent: device cgroup's parent
 * returns: != 0 in case it's allowed, 0 otherwise
 */
static inline int may_allow_all(struct dev_cgroup *parent)
{
        if (!parent)
                return 1;
        return parent->behavior == DEVCG_DEFAULT_ALLOW;
}
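/*
 * Worked example (sketch, not part of this file): if the parent is
 * DEVCG_DEFAULT_DENY with the single exception "c 1:3 rw", a child in
 * DENY mode may add "c 1:3 r" (fully contained in the parent's
 * exception) but not "c 1:5 r" or "c 1:3 rwm" (both reach beyond what
 * the parent grants); this is the match_exception() branch taken by
 * verify_new_ex() above.
 */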
/**
 * revalidate_active_exceptions - walks through the active exception list and
 *                                revalidates the exceptions based on parent's
 *                                behavior and exceptions. The exceptions that
 *                                are no longer valid will be removed.
 *                                Called with devcgroup_mutex held.
 * @devcg: cgroup which exceptions will be checked
 *
 * This is one of the three key functions for hierarchy implementation.
 * This function is responsible for re-evaluating all the cgroup's active
 * exceptions due to a parent's exception change.
 * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details.
 */
static void revalidate_active_exceptions(struct dev_cgroup *devcg)
{
        struct dev_exception_item *ex;
        struct list_head *this, *tmp;

        list_for_each_safe(this, tmp, &devcg->exceptions) {
                ex = container_of(this, struct dev_exception_item, list);
                if (!parent_has_perm(devcg, ex))
                        dev_exception_rm(devcg, ex);
        }
}

/**
 * propagate_exception - propagates a new exception to the children
 * @devcg_root: device cgroup that added a new exception
 * @ex: new exception to be propagated
 *
 * returns: 0 in case of success, != 0 in case of error
 */
static int propagate_exception(struct dev_cgroup *devcg_root,
                               struct dev_exception_item *ex)
{
        struct cgroup_subsys_state *pos;
        int rc = 0;

        rcu_read_lock();

        css_for_each_descendant_pre(pos, &devcg_root->css) {
                struct dev_cgroup *devcg = css_to_devcgroup(pos);

                /*
                 * Because devcgroup_mutex is held, no devcg will become
                 * online or offline during the tree walk (see on/offline
                 * methods), and online ones are safe to access outside RCU
                 * read lock without bumping refcnt.
                 */
                if (pos == &devcg_root->css || !is_devcg_online(devcg))
                        continue;

                rcu_read_unlock();

                /*
                 * In case both the root's and the devcg's behavior is allow,
                 * a new restriction means adding to the exception list.
                 */
                if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW &&
                    devcg->behavior == DEVCG_DEFAULT_ALLOW) {
                        rc = dev_exception_add(devcg, ex);
                        if (rc)
                                return rc;
                } else {
                        /*
                         * in the other possible cases:
                         * root's behavior: allow, devcg's: deny
                         * root's behavior: deny, devcg's: deny
                         * the exception will be removed
                         */
                        dev_exception_rm(devcg, ex);
                }
                revalidate_active_exceptions(devcg);

                rcu_read_lock();
        }

        rcu_read_unlock();
        return rc;
}

/*
 * Modify the exception list using allow/deny rules.
 * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
 * so we can give a container CAP_MKNOD to let it create devices but not
 * modify the exception list.
 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
 * us to also grant CAP_SYS_ADMIN to containers without giving away the
 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
 *
 * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
 * new access is only allowed if you're in the top-level cgroup, or your
 * parent cgroup has the access you're asking for.
 */
static int devcgroup_update_access(struct dev_cgroup *devcgroup,
                                   int filetype, char *buffer)
{
        const char *b;
        char temp[12];          /* 11 + 1 characters needed for a u32 */
        int count, rc = 0;
        struct dev_exception_item ex;
        struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent);
        struct dev_cgroup tmp_devcgrp;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        memset(&ex, 0, sizeof(ex));
        memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp));
        b = buffer;

        switch (*b) {
        case 'a':
                switch (filetype) {
                case DEVCG_ALLOW:
                        if (css_has_online_children(&devcgroup->css))
                                return -EINVAL;

                        if (!may_allow_all(parent))
                                return -EPERM;
                        if (!parent) {
                                devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
                                dev_exception_clean(devcgroup);
                                break;
                        }

                        INIT_LIST_HEAD(&tmp_devcgrp.exceptions);
                        rc = dev_exceptions_copy(&tmp_devcgrp.exceptions,
                                                 &devcgroup->exceptions);
                        if (rc)
                                return rc;
                        dev_exception_clean(devcgroup);
                        rc = dev_exceptions_copy(&devcgroup->exceptions,
                                                 &parent->exceptions);
                        if (rc) {
                                dev_exceptions_move(&devcgroup->exceptions,
                                                    &tmp_devcgrp.exceptions);
                                return rc;
                        }
                        devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
                        dev_exception_clean(&tmp_devcgrp);
                        break;
                case DEVCG_DENY:
                        if (css_has_online_children(&devcgroup->css))
                                return -EINVAL;

                        dev_exception_clean(devcgroup);
                        devcgroup->behavior = DEVCG_DEFAULT_DENY;
                        break;
                default:
                        return -EINVAL;
                }
                return 0;
        case 'b':
                ex.type = DEVCG_DEV_BLOCK;
                break;
        case 'c':
                ex.type = DEVCG_DEV_CHAR;
                break;
        default:
                return -EINVAL;
        }
        b++;
        if (!isspace(*b))
                return -EINVAL;
        b++;
        if (*b == '*') {
                ex.major = ~0;
                b++;
        } else if (isdigit(*b)) {
                memset(temp, 0, sizeof(temp));
                for (count = 0; count < sizeof(temp) - 1; count++) {
                        temp[count] = *b;
                        b++;
                        if (!isdigit(*b))
                                break;
                }
                rc = kstrtou32(temp, 10, &ex.major);
                if (rc)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (*b != ':')
                return -EINVAL;
        b++;

        /* read minor */
        if (*b == '*') {
                ex.minor = ~0;
                b++;
        } else if (isdigit(*b)) {
                memset(temp, 0, sizeof(temp));
                for (count = 0; count < sizeof(temp) - 1; count++) {
                        temp[count] = *b;
                        b++;
                        if (!isdigit(*b))
                                break;
                }
                rc = kstrtou32(temp, 10, &ex.minor);
                if (rc)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (!isspace(*b))
                return -EINVAL;
        for (b++, count = 0; count < 3; count++, b++) {
                switch (*b) {
                case 'r':
                        ex.access |= DEVCG_ACC_READ;
                        break;
                case 'w':
                        ex.access |= DEVCG_ACC_WRITE;
                        break;
                case 'm':
                        ex.access |= DEVCG_ACC_MKNOD;
                        break;
                case '\n':
                case '\0':
                        count = 3;
                        break;
                default:
                        return -EINVAL;
                }
        }

        switch (filetype) {
        case DEVCG_ALLOW:
                /*
                 * If the default policy is to allow, try to remove a
                 * matching exception instead.  And be silent about it: we
                 * don't want to break compatibility.
                 */
                if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
                        /* Check if the parent allows removing it first */
                        if (!parent_allows_removal(devcgroup, &ex))
                                return -EPERM;
                        dev_exception_rm(devcgroup, &ex);
                        break;
                }

                if (!parent_has_perm(devcgroup, &ex))
                        return -EPERM;
                rc = dev_exception_add(devcgroup, &ex);
                break;
        case DEVCG_DENY:
                /*
                 * If the default policy is to deny, try to remove a
                 * matching exception instead.  And be silent about it: we
                 * don't want to break compatibility.
                 */
                if (devcgroup->behavior == DEVCG_DEFAULT_DENY)
                        dev_exception_rm(devcgroup, &ex);
                else
                        rc = dev_exception_add(devcgroup, &ex);

                if (rc)
                        break;
                /* we only propagate new restrictions */
                rc = propagate_exception(devcgroup, &ex);
                break;
        default:
                rc = -EINVAL;
        }
        return rc;
}
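/*
 * Example (userspace sketch, not part of this file): feeding a rule
 * through the cgroup-v1 files exercises devcgroup_update_access() above;
 * the cgroup path "/sys/fs/cgroup/devices/grp" is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *rule = "c 1:3 rwm";  /* deny r/w/mknod on char 1:3 */
        int fd = open("/sys/fs/cgroup/devices/grp/devices.deny", O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, rule, strlen(rule)) < 0)
                perror("write");
        close(fd);
        return 0;
}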
static ssize_t devcgroup_access_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
{
        int retval;

        mutex_lock(&devcgroup_mutex);
        retval = devcgroup_update_access(css_to_devcgroup(of_css(of)),
                                         of_cft(of)->private, strstrip(buf));
        mutex_unlock(&devcgroup_mutex);
        return retval ?: nbytes;
}

static struct cftype dev_cgroup_files[] = {
        {
                .name = "allow",
                .write = devcgroup_access_write,
                .private = DEVCG_ALLOW,
        },
        {
                .name = "deny",
                .write = devcgroup_access_write,
                .private = DEVCG_DENY,
        },
        {
                .name = "list",
                .seq_show = devcgroup_seq_show,
                .private = DEVCG_LIST,
        },
        { }     /* terminate */
};

struct cgroup_subsys devices_cgrp_subsys = {
        .css_alloc = devcgroup_css_alloc,
        .css_free = devcgroup_css_free,
        .css_online = devcgroup_online,
        .css_offline = devcgroup_offline,
        .legacy_cftypes = dev_cgroup_files,
};

/**
 * devcgroup_legacy_check_permission - checks if an inode operation is permitted
 * @type: device type
 * @major: device major number
 * @minor: device minor number
 * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD
 *
 * returns 0 on success, -EPERM in case the operation is not permitted
 */
static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor,
                                             short access)
{
        struct dev_cgroup *dev_cgroup;
        bool rc;

        rcu_read_lock();
        dev_cgroup = task_devcgroup(current);
        if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW)
                /* Can't match any of the exceptions, even partially */
                rc = !match_exception_partial(&dev_cgroup->exceptions,
                                              type, major, minor, access);
        else
                /* Need to match completely one exception to be allowed */
                rc = match_exception(&dev_cgroup->exceptions, type, major,
                                     minor, access);
        rcu_read_unlock();

        if (!rc)
                return -EPERM;

        return 0;
}

#endif /* CONFIG_CGROUP_DEVICE */

#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)

int devcgroup_check_permission(short type, u32 major, u32 minor, short access)
{
        int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);

        if (rc)
                return rc;

#ifdef CONFIG_CGROUP_DEVICE
        return devcgroup_legacy_check_permission(type, major, minor, access);
#else /* CONFIG_CGROUP_DEVICE */
        return 0;
#endif /* CONFIG_CGROUP_DEVICE */
}
EXPORT_SYMBOL(devcgroup_check_permission);

#endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */
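/*
 * Example (sketch, not part of this file): how a driver's open path
 * might consult the device cgroup via the exported entry point above;
 * example_chrdev_check() is a hypothetical kernel-side helper.
 */
static int example_chrdev_check(dev_t devt, fmode_t mode)
{
        short access = 0;

        if (mode & FMODE_READ)
                access |= DEVCG_ACC_READ;
        if (mode & FMODE_WRITE)
                access |= DEVCG_ACC_WRITE;

        return devcgroup_check_permission(DEVCG_DEV_CHAR, MAJOR(devt),
                                          MINOR(devt), access);
}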
// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>

#include "mount.h"
#include "internal.h"

static struct vfsmount *nsfs_mnt;

static long ns_ioctl(struct file *filp, unsigned int ioctl,
                     unsigned long arg);
static const struct file_operations ns_file_operations = {
        .unlocked_ioctl = ns_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        struct ns_common *ns = inode->i_private;
        const struct proc_ns_operations *ns_ops = ns->ops;

        return dynamic_dname(buffer, buflen, "%s:[%lu]",
                             ns_ops->name, inode->i_ino);
}

const struct dentry_operations ns_dentry_operations = {
        .d_dname        = ns_dname,
        .d_prune        = stashed_dentry_prune,
};
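/*
 * Example (userspace sketch, not part of this file): ns_dname() above is
 * why a namespace symlink reads back as "TYPE:[INODE]":
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[64];
        ssize_t n = readlink("/proc/self/ns/net", buf, sizeof(buf) - 1);

        if (n < 0)
                return 1;
        buf[n] = '\0';
        printf("%s\n", buf);    /* e.g. "net:[4026531840]" */
        return 0;
}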
static void nsfs_evict(struct inode *inode)
{
        struct ns_common *ns = inode->i_private;

        clear_inode(inode);
        ns->ops->put(ns);
}

int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
                   void *private_data)
{
        struct ns_common *ns;

        ns = ns_get_cb(private_data);
        if (!ns)
                return -ENOENT;

        return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);
}

struct ns_get_path_task_args {
        const struct proc_ns_operations *ns_ops;
        struct task_struct *task;
};

static struct ns_common *ns_get_path_task(void *private_data)
{
        struct ns_get_path_task_args *args = private_data;

        return args->ns_ops->get(args->task);
}

int ns_get_path(struct path *path, struct task_struct *task,
                const struct proc_ns_operations *ns_ops)
{
        struct ns_get_path_task_args args = {
                .ns_ops = ns_ops,
                .task   = task,
        };

        return ns_get_path_cb(path, ns_get_path_task, &args);
}

/**
 * open_namespace - open a namespace
 * @ns: the namespace to open
 *
 * This will consume a reference to @ns independent of success or failure.
 *
 * Return: A file descriptor on success or a negative error code on failure.
 */
int open_namespace(struct ns_common *ns)
{
        struct path path __free(path_put) = {};
        struct file *f;
        int err;

        /* call first to consume reference */
        err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
        if (err < 0)
                return err;

        CLASS(get_unused_fd, fd)(O_CLOEXEC);
        if (fd < 0)
                return fd;

        f = dentry_open(&path, O_RDONLY, current_cred());
        if (IS_ERR(f))
                return PTR_ERR(f);

        fd_install(fd, f);
        return take_fd(fd);
}

int open_related_ns(struct ns_common *ns,
                    struct ns_common *(*get_ns)(struct ns_common *ns))
{
        struct ns_common *relative;

        relative = get_ns(ns);
        if (IS_ERR(relative))
                return PTR_ERR(relative);

        return open_namespace(relative);
}
EXPORT_SYMBOL_GPL(open_related_ns);

static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
                                struct mnt_ns_info __user *uinfo, size_t usize,
                                struct mnt_ns_info *kinfo)
{
        /*
         * If userspace and the kernel have the same struct size it can just
         * be copied. If userspace provides an older struct, only the bits
         * that userspace knows about will be copied. If userspace provides
         * a new struct, only the bits that the kernel knows about will be
         * copied and the size value will be set to the size the kernel
         * knows about.
         */
        kinfo->size      = min(usize, sizeof(*kinfo));
        kinfo->mnt_ns_id = mnt_ns->seq;
        kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts);
        /* Subtract the root mount of the mount namespace. */
        if (kinfo->nr_mounts)
                kinfo->nr_mounts--;

        if (copy_to_user(uinfo, kinfo, kinfo->size))
                return -EFAULT;

        return 0;
}

static bool nsfs_ioctl_valid(unsigned int cmd)
{
        switch (cmd) {
        case NS_GET_USERNS:
        case NS_GET_PARENT:
        case NS_GET_NSTYPE:
        case NS_GET_OWNER_UID:
        case NS_GET_MNTNS_ID:
        case NS_GET_PID_FROM_PIDNS:
        case NS_GET_TGID_FROM_PIDNS:
        case NS_GET_PID_IN_PIDNS:
        case NS_GET_TGID_IN_PIDNS:
                return true;
        }

        /* Extensible ioctls require some extra handling. */
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(NS_MNT_GET_INFO):
        case _IOC_NR(NS_MNT_GET_NEXT):
        case _IOC_NR(NS_MNT_GET_PREV):
                return (_IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_INFO));
        }

        return false;
}
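/*
 * Example (userspace sketch, not part of this file): the plain NS_GET_*
 * ioctls validated above, used to find the owning user namespace and its
 * owner's UID:
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nsfs.h>

int main(void)
{
        uid_t uid;
        int userns_fd, fd = open("/proc/self/ns/net", O_RDONLY);

        if (fd < 0)
                return 1;
        userns_fd = ioctl(fd, NS_GET_USERNS);   /* new fd for the owner */
        if (userns_fd >= 0) {
                if (ioctl(userns_fd, NS_GET_OWNER_UID, &uid) == 0)
                        printf("owner uid: %u\n", uid);
                close(userns_fd);
        }
        close(fd);
        return 0;
}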
static long ns_ioctl(struct file *filp, unsigned int ioctl,
                     unsigned long arg)
{
        struct user_namespace *user_ns;
        struct pid_namespace *pid_ns;
        struct task_struct *tsk;
        struct ns_common *ns;
        struct mnt_namespace *mnt_ns;
        bool previous = false;
        uid_t __user *argp;
        uid_t uid;
        int ret;

        if (!nsfs_ioctl_valid(ioctl))
                return -ENOIOCTLCMD;

        ns = get_proc_ns(file_inode(filp));
        switch (ioctl) {
        case NS_GET_USERNS:
                return open_related_ns(ns, ns_get_owner);
        case NS_GET_PARENT:
                if (!ns->ops->get_parent)
                        return -EINVAL;
                return open_related_ns(ns, ns->ops->get_parent);
        case NS_GET_NSTYPE:
                return ns->ops->type;
        case NS_GET_OWNER_UID:
                if (ns->ops->type != CLONE_NEWUSER)
                        return -EINVAL;
                user_ns = container_of(ns, struct user_namespace, ns);
                argp = (uid_t __user *) arg;
                uid = from_kuid_munged(current_user_ns(), user_ns->owner);
                return put_user(uid, argp);
        case NS_GET_MNTNS_ID: {
                __u64 __user *idp;
                __u64 id;

                if (ns->ops->type != CLONE_NEWNS)
                        return -EINVAL;

                mnt_ns = container_of(ns, struct mnt_namespace, ns);
                idp = (__u64 __user *)arg;
                id = mnt_ns->seq;
                return put_user(id, idp);
        }
        case NS_GET_PID_FROM_PIDNS:
                fallthrough;
        case NS_GET_TGID_FROM_PIDNS:
                fallthrough;
        case NS_GET_PID_IN_PIDNS:
                fallthrough;
        case NS_GET_TGID_IN_PIDNS: {
                if (ns->ops->type != CLONE_NEWPID)
                        return -EINVAL;

                ret = -ESRCH;
                pid_ns = container_of(ns, struct pid_namespace, ns);

                guard(rcu)();

                if (ioctl == NS_GET_PID_IN_PIDNS ||
                    ioctl == NS_GET_TGID_IN_PIDNS)
                        tsk = find_task_by_vpid(arg);
                else
                        tsk = find_task_by_pid_ns(arg, pid_ns);
                if (!tsk)
                        break;

                switch (ioctl) {
                case NS_GET_PID_FROM_PIDNS:
                        ret = task_pid_vnr(tsk);
                        break;
                case NS_GET_TGID_FROM_PIDNS:
                        ret = task_tgid_vnr(tsk);
                        break;
                case NS_GET_PID_IN_PIDNS:
                        ret = task_pid_nr_ns(tsk, pid_ns);
                        break;
                case NS_GET_TGID_IN_PIDNS:
                        ret = task_tgid_nr_ns(tsk, pid_ns);
                        break;
                default:
                        ret = 0;
                        break;
                }

                if (!ret)
                        ret = -ESRCH;
                return ret;
        }
        }

        /* extensible ioctls */
        switch (_IOC_NR(ioctl)) {
        case _IOC_NR(NS_MNT_GET_INFO): {
                struct mnt_ns_info kinfo = {};
                struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
                size_t usize = _IOC_SIZE(ioctl);

                if (ns->ops->type != CLONE_NEWNS)
                        return -EINVAL;

                if (!uinfo)
                        return -EINVAL;

                if (usize < MNT_NS_INFO_SIZE_VER0)
                        return -EINVAL;

                return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
        }
        case _IOC_NR(NS_MNT_GET_PREV):
                previous = true;
                fallthrough;
        case _IOC_NR(NS_MNT_GET_NEXT): {
                struct mnt_ns_info kinfo = {};
                struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
                struct path path __free(path_put) = {};
                struct file *f __free(fput) = NULL;
                size_t usize = _IOC_SIZE(ioctl);

                if (ns->ops->type != CLONE_NEWNS)
                        return -EINVAL;

                if (usize < MNT_NS_INFO_SIZE_VER0)
                        return -EINVAL;

                mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
                if (IS_ERR(mnt_ns))
                        return PTR_ERR(mnt_ns);

                ns = to_ns_common(mnt_ns);
                /* Transfer ownership of @mnt_ns reference to @path. */
                ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
                if (ret)
                        return ret;

                CLASS(get_unused_fd, fd)(O_CLOEXEC);
                if (fd < 0)
                        return fd;

                f = dentry_open(&path, O_RDONLY, current_cred());
                if (IS_ERR(f))
                        return PTR_ERR(f);

                if (uinfo) {
                        /*
                         * If @uinfo is passed return all information about
                         * the mount namespace as well.
                         */
                        ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo,
                                                   usize, &kinfo);
                        if (ret)
                                return ret;
                }

                /* Transfer reference of @f to caller's fdtable. */
                fd_install(fd, no_free_ptr(f));
                /* File descriptor is live so hand it off to the caller. */
                return take_fd(fd);
        }
        default:
                ret = -ENOTTY;
        }

        return ret;
}
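/*
 * Example (userspace sketch, not part of this file): walking mount
 * namespaces with the extensible ioctls handled above. NS_MNT_GET_NEXT
 * returns a new namespace fd; per the handler, the info argument may be
 * NULL.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nsfs.h>

int main(void)
{
        int fd = open("/proc/self/ns/mnt", O_RDONLY);

        while (fd >= 0) {
                struct mnt_ns_info info = {};
                int next;

                if (ioctl(fd, NS_MNT_GET_INFO, &info) == 0)
                        printf("mnt_ns %llu: %u mounts\n",
                               (unsigned long long)info.mnt_ns_id,
                               info.nr_mounts);
                next = ioctl(fd, NS_MNT_GET_NEXT, NULL);
                close(fd);
                fd = next;      /* negative once the last namespace is reached */
        }
        return 0;
}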
int ns_get_name(char *buf, size_t size, struct task_struct *task,
                const struct proc_ns_operations *ns_ops)
{
        struct ns_common *ns;
        int res = -ENOENT;
        const char *name;

        ns = ns_ops->get(task);
        if (ns) {
                name = ns_ops->real_ns_name ? : ns_ops->name;
                res = snprintf(buf, size, "%s:[%u]", name, ns->inum);
                ns_ops->put(ns);
        }
        return res;
}

bool proc_ns_file(const struct file *file)
{
        return file->f_op == &ns_file_operations;
}

/**
 * ns_match() - Returns true if current namespace matches dev/ino provided.
 * @ns: current namespace
 * @dev: dev_t from nsfs that will be matched against current nsfs
 * @ino: ino_t from nsfs that will be matched against current nsfs
 *
 * Return: true if dev and ino match the current nsfs.
 */
bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
{
        return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev);
}

static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        const struct ns_common *ns = inode->i_private;
        const struct proc_ns_operations *ns_ops = ns->ops;

        seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
        return 0;
}

static const struct super_operations nsfs_ops = {
        .statfs = simple_statfs,
        .evict_inode = nsfs_evict,
        .show_path = nsfs_show_path,
};

static int nsfs_init_inode(struct inode *inode, void *data)
{
        struct ns_common *ns = data;

        inode->i_private = data;
        inode->i_mode |= S_IRUGO;
        inode->i_fop = &ns_file_operations;
        inode->i_ino = ns->inum;
        return 0;
}

static void nsfs_put_data(void *data)
{
        struct ns_common *ns = data;

        ns->ops->put(ns);
}

static const struct stashed_operations nsfs_stashed_ops = {
        .init_inode = nsfs_init_inode,
        .put_data = nsfs_put_data,
};

static int nsfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);

        if (!ctx)
                return -ENOMEM;
        ctx->ops = &nsfs_ops;
        ctx->dops = &ns_dentry_operations;
        fc->s_fs_info = (void *)&nsfs_stashed_ops;
        return 0;
}

static struct file_system_type nsfs = {
        .name = "nsfs",
        .init_fs_context = nsfs_init_fs_context,
        .kill_sb = kill_anon_super,
};

void __init nsfs_init(void)
{
        nsfs_mnt = kern_mount(&nsfs);
        if (IS_ERR(nsfs_mnt))
                panic("can't set nsfs up\n");
        nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER;
}
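/*
 * Example (userspace sketch, not part of this file): the (dev, ino) pair
 * compared by ns_match() is the same one stat(2) reports, so two
 * processes share a namespace iff both fields are equal:
 */
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        struct stat a, b;

        if (stat("/proc/1/ns/net", &a) || stat("/proc/self/ns/net", &b))
                return 1;
        printf("same netns: %s\n",
               (a.st_ino == b.st_ino && a.st_dev == b.st_dev) ? "yes" : "no");
        return 0;
}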