Total coverage: 130474 (7%)of 1973018
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/options.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Option parsing */ #include <linux/string.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nls.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> #include "hfsplus_fs.h" enum { opt_creator, opt_type, opt_umask, opt_uid, opt_gid, opt_part, opt_session, opt_nls, opt_decompose, opt_barrier, opt_force, }; static const struct fs_parameter_spec hfs_param_spec[] = { fsparam_string ("creator", opt_creator), fsparam_string ("type", opt_type), fsparam_u32oct ("umask", opt_umask), fsparam_u32 ("uid", opt_uid), fsparam_u32 ("gid", opt_gid), fsparam_u32 ("part", opt_part), fsparam_u32 ("session", opt_session), fsparam_string ("nls", opt_nls), fsparam_flag_no ("decompose", opt_decompose), fsparam_flag_no ("barrier", opt_barrier), fsparam_flag ("force", opt_force), {} }; /* Initialize an options object to reasonable defaults */ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) { if (!opts) return; opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; opts->session = -1; } /* Parse options from mount. Returns nonzero errno on failure */ int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hfsplus_sb_info *sbi = fc->s_fs_info; struct fs_parse_result result; int opt; /* * Only the force option is examined during remount, all others * are ignored. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && strncmp(param->key, "force", 5)) return 0; opt = fs_parse(fc, hfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case opt_creator: if (strlen(param->string) != 4) { pr_err("creator requires a 4 character value\n"); return -EINVAL; } memcpy(&sbi->creator, param->string, 4); break; case opt_type: if (strlen(param->string) != 4) { pr_err("type requires a 4 character value\n"); return -EINVAL; } memcpy(&sbi->type, param->string, 4); break; case opt_umask: sbi->umask = (umode_t)result.uint_32; break; case opt_uid: sbi->uid = result.uid; set_bit(HFSPLUS_SB_UID, &sbi->flags); break; case opt_gid: sbi->gid = result.gid; set_bit(HFSPLUS_SB_GID, &sbi->flags); break; case opt_part: sbi->part = result.uint_32; break; case opt_session: sbi->session = result.uint_32; break; case opt_nls: if (sbi->nls) { pr_err("unable to change nls mapping\n"); return -EINVAL; } sbi->nls = load_nls(param->string); if (!sbi->nls) { pr_err("unable to load nls mapping \"%s\"\n", param->string); return -EINVAL; } break; case opt_decompose: if (result.negated) set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); else clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags); break; case opt_barrier: if (result.negated) set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); else clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags); break; case opt_force: set_bit(HFSPLUS_SB_FORCE, &sbi->flags); break; default: return -EINVAL; } return 0; } int hfsplus_show_options(struct seq_file *seq, struct dentry *root) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); if (sbi->part >= 0) seq_printf(seq, ",part=%u", sbi->part); if (sbi->session >= 0) seq_printf(seq, ",session=%u", sbi->session); if (sbi->nls) seq_printf(seq, ",nls=%s", sbi->nls->charset); if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags)) seq_puts(seq, ",nodecompose"); if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) seq_puts(seq, ",nobarrier"); return 0; }
5 5 5 5 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "bat_iv_ogm.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/cache.h> #include <linux/container_of.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/pkt_sched.h> #include <linux/printk.h> #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/string_choices.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bitarray.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work); /** * enum batadv_dup_status - duplicate status */ enum batadv_dup_status { /** @BATADV_NO_DUP: the packet is no duplicate */ BATADV_NO_DUP = 0, /** * @BATADV_ORIG_DUP: OGM is a duplicate in the originator (but not for * the neighbor) */ BATADV_ORIG_DUP, /** @BATADV_NEIGH_DUP: OGM is a duplicate for the neighbor */ BATADV_NEIGH_DUP, /** * @BATADV_PROTECTED: originator is currently protected (after reboot) */ BATADV_PROTECTED, }; /** * batadv_ring_buffer_set() - update the ring buffer with the given value * @lq_recv: pointer to the ring buffer * @lq_index: index to store the value at * @value: value to store in the ring buffer */ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) { lq_recv[*lq_index] = value; *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; } /** * batadv_ring_buffer_avg() - compute the average of all non-zero values stored * in the given ring buffer * @lq_recv: pointer to the ring buffer * * Return: computed average value. */ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { const u8 *ptr; u16 count = 0; u16 i = 0; u16 sum = 0; ptr = lq_recv; while (i < BATADV_TQ_GLOBAL_WINDOW_SIZE) { if (*ptr != 0) { count++; sum += *ptr; } i++; ptr++; } if (count == 0) return 0; return (u8)(sum / count); } /** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator * @bat_priv: the bat priv with all the soft interface information * @addr: mac address of the originator * * Return: the originator object corresponding to the passed mac address or NULL * on failure. * If the object does not exist, it is created and initialised. */ static struct batadv_orig_node * batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int hash_added; orig_node = batadv_orig_hash_find(bat_priv, addr); if (orig_node) return orig_node; orig_node = batadv_orig_node_new(bat_priv, addr); if (!orig_node) return NULL; spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock); kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) goto free_orig_node_hash; return orig_node; free_orig_node_hash: /* reference for batadv_hash_add */ batadv_orig_node_put(orig_node); /* reference from batadv_orig_node_new */ batadv_orig_node_put(orig_node); return NULL; } static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; neigh_node->orig_node = orig_neigh; out: return neigh_node; } static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; u32 random_seqno; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno); hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN; ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC); if (!ogm_buff) { mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return -ENOMEM; } hard_iface->bat_iv.ogm_buff = ogm_buff; batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff; batadv_ogm_packet->packet_type = BATADV_IV_OGM; batadv_ogm_packet->version = BATADV_COMPAT_VERSION; batadv_ogm_packet->ttl = 2; batadv_ogm_packet->flags = BATADV_NO_FLAGS; batadv_ogm_packet->reserved = 0; batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); return 0; } static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface) { mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); kfree(hard_iface->bat_iv.ogm_buff); hard_iface->bat_iv.ogm_buff = NULL; mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; ether_addr_copy(batadv_ogm_packet->orig, hard_iface->net_dev->dev_addr); ether_addr_copy(batadv_ogm_packet->prev_sender, hard_iface->net_dev->dev_addr); unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } static void batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; void *ogm_buff; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); ogm_buff = hard_iface->bat_iv.ogm_buff; if (!ogm_buff) goto unlock; batadv_ogm_packet = ogm_buff; batadv_ogm_packet->ttl = BATADV_TTL; unlock: mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /* when do we schedule our own ogm to be sent */ static unsigned long batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) { unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; msecs += get_random_u32_below(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) { int hop_penalty = atomic_read(&bat_priv->hop_penalty); int new_tq; new_tq = tq * (BATADV_TQ_MAX_VALUE - hop_penalty); new_tq /= BATADV_TQ_MAX_VALUE; return new_tq; } /** * batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached * @buff_pos: current position in the skb * @packet_len: total length of the skb * @ogm_packet: potential OGM in buffer * * Return: true if there is enough space for another OGM, false otherwise. */ static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, const struct batadv_ogm_packet *ogm_packet) { int next_buff_pos = 0; /* check if there is enough space for the header */ next_buff_pos += buff_pos + sizeof(*ogm_packet); if (next_buff_pos > packet_len) return false; /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); return (next_buff_pos <= packet_len) && (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); } /* send a batman ogm to a given interface */ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); const char *fwd_str; u8 packet_num; s16 buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; if (hard_iface->if_status != BATADV_IF_ACTIVE) return; packet_num = 0; buff_pos = 0; packet_pos = forw_packet->skb->data; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; /* adjust all flags and log packets */ while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len, batadv_ogm_packet)) { /* we might have aggregated direct link packets with an * ordinary base packet */ if (forw_packet->direct_link_flags & BIT(packet_num) && forw_packet->if_incoming == hard_iface) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; if (packet_num > 0 || !forw_packet->own) fwd_str = "Forwarding"; else fwd_str = "Sending own"; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s %spacket (originator %pM, seqno %u, TQ %d, TTL %d, IDF %s) on interface %s [%pM]\n", fwd_str, (packet_num > 0 ? "aggregated " : ""), batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, str_on_off(batadv_ogm_packet->flags & BATADV_DIRECTLINK), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); buff_pos += BATADV_OGM_HLEN; buff_pos += ntohs(batadv_ogm_packet->tvlv_len); packet_num++; packet_pos = forw_packet->skb->data + buff_pos; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_pos; } /* create clone because function is called more than once */ skb = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb) { batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES, skb->len + ETH_HLEN); batadv_send_broadcast_skb(skb, hard_iface); } } /* send a batman ogm packet */ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { struct net_device *soft_iface; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); return; } soft_iface = forw_packet->if_incoming->soft_iface; if (WARN_ON(!forw_packet->if_outgoing)) return; if (forw_packet->if_outgoing->soft_iface != soft_iface) { pr_warn("%s: soft interface switch for queued OGM\n", __func__); return; } if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) return; /* only for one specific outgoing interface */ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); } /** * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an * existing forward packet * @new_bat_ogm_packet: OGM packet to be aggregated * @bat_priv: the bat priv with all the soft interface information * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @directlink: true if this is a direct link packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @forw_packet: the forwarded packet which should be checked * * Return: true if new_packet can be aggregated with forw_packet */ static bool batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, struct batadv_priv *bat_priv, int packet_len, unsigned long send_time, bool directlink, const struct batadv_hard_iface *if_incoming, const struct batadv_hard_iface *if_outgoing, const struct batadv_forw_packet *forw_packet) { struct batadv_ogm_packet *batadv_ogm_packet; int aggregated_bytes = forw_packet->packet_len + packet_len; struct batadv_hard_iface *primary_if = NULL; bool res = false; unsigned long aggregation_end_time; batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data; aggregation_end_time = send_time; aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* we can aggregate the current packet to this aggregated packet * if: * * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet won't be bigger than * MAX_AGGREGATION_BYTES * otherwise aggregation is not possible */ if (!time_before(send_time, forw_packet->send_time) || !time_after_eq(aggregation_end_time, forw_packet->send_time)) return false; if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) return false; /* packet is not leaving on the same interface. */ if (forw_packet->if_outgoing != if_outgoing) return false; /* check aggregation compatibility * -> direct link packets are broadcasted on * their interface only * -> aggregate packet if the current packet is * a "global" packet as well as the base * packet */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return false; /* packets without direct link flag and high TTL * are flooded through the net */ if (!directlink && !(batadv_ogm_packet->flags & BATADV_DIRECTLINK) && batadv_ogm_packet->ttl != 1 && /* own packets originating non-primary * interfaces leave only that interface */ (!forw_packet->own || forw_packet->if_incoming == primary_if)) { res = true; goto out; } /* if the incoming packet is sent via this one * interface only - we still can aggregate */ if (directlink && new_bat_ogm_packet->ttl == 1 && forw_packet->if_incoming == if_incoming && /* packets from direct neighbors or * own secondary interface packets * (= secondary interface packets in general) */ (batadv_ogm_packet->flags & BATADV_DIRECTLINK || (forw_packet->own && forw_packet->if_incoming != primary_if))) { res = true; goto out; } out: batadv_hardif_put(primary_if); return res; } /** * batadv_iv_ogm_aggregate_new() - create a new aggregated packet and add this * packet to it. * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @direct_link: whether this OGM has direct link status * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm */ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, int packet_len, unsigned long send_time, bool direct_link, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_forw_packet *forw_packet_aggr; struct sk_buff *skb; unsigned char *skb_buff; unsigned int skb_size; atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; if (atomic_read(&bat_priv->aggregated_ogms) && packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; else skb_size = packet_len; skb_size += ETH_HLEN; skb = netdev_alloc_skb_ip_align(NULL, skb_size); if (!skb) return; forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, queue_left, bat_priv, skb); if (!forw_packet_aggr) { kfree_skb(skb); return; } forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); skb_buff = skb_put(forw_packet_aggr->skb, packet_len); forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); forw_packet_aggr->own = own_packet; forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; forw_packet_aggr->send_time = send_time; /* save packet direct link flag status */ if (direct_link) forw_packet_aggr->direct_link_flags |= 1; INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, batadv_iv_send_outstanding_bat_ogm_packet); batadv_forw_packet_ogmv1_queue(bat_priv, forw_packet_aggr, send_time); } /* aggregate a new packet into the existing ogm packet */ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, const unsigned char *packet_buff, int packet_len, bool direct_link) { unsigned long new_direct_link_flag; skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len); forw_packet_aggr->packet_len += packet_len; forw_packet_aggr->num_packets++; /* save packet direct link flag status */ if (direct_link) { new_direct_link_flag = BIT(forw_packet_aggr->num_packets); forw_packet_aggr->direct_link_flags |= new_direct_link_flag; } } /** * batadv_iv_ogm_queue_add() - queue up an OGM for transmission * @bat_priv: the bat priv with all the soft interface information * @packet_buff: pointer to the OGM * @packet_len: (total) length of the OGM * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @own_packet: true if it is a self-generated ogm * @send_time: timestamp (jiffies) when the packet is to be sent */ static void batadv_iv_ogm_queue_add(struct batadv_priv *bat_priv, unsigned char *packet_buff, int packet_len, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, int own_packet, unsigned long send_time) { /* _aggr -> pointer to the packet we want to aggregate with * _pos -> pointer to the position in the queue */ struct batadv_forw_packet *forw_packet_aggr = NULL; struct batadv_forw_packet *forw_packet_pos = NULL; struct batadv_ogm_packet *batadv_ogm_packet; bool direct_link; unsigned long max_aggregation_jiffies; batadv_ogm_packet = (struct batadv_ogm_packet *)packet_buff; direct_link = !!(batadv_ogm_packet->flags & BATADV_DIRECTLINK); max_aggregation_jiffies = msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); /* find position for the packet in the forward queue */ spin_lock_bh(&bat_priv->forw_bat_list_lock); /* own packets are not to be aggregated */ if (atomic_read(&bat_priv->aggregated_ogms) && !own_packet) { hlist_for_each_entry(forw_packet_pos, &bat_priv->forw_bat_list, list) { if (batadv_iv_ogm_can_aggregate(batadv_ogm_packet, bat_priv, packet_len, send_time, direct_link, if_incoming, if_outgoing, forw_packet_pos)) { forw_packet_aggr = forw_packet_pos; break; } } } /* nothing to aggregate with - either aggregation disabled or no * suitable aggregation packet found */ if (!forw_packet_aggr) { /* the following section can run without the lock */ spin_unlock_bh(&bat_priv->forw_bat_list_lock); /* if we could not aggregate this packet with one of the others * we hold it back for a while, so that it might be aggregated * later on */ if (!own_packet && atomic_read(&bat_priv->aggregated_ogms)) send_time += max_aggregation_jiffies; batadv_iv_ogm_aggregate_new(packet_buff, packet_len, send_time, direct_link, if_incoming, if_outgoing, own_packet); } else { batadv_iv_ogm_aggregate(forw_packet_aggr, packet_buff, packet_len, direct_link); spin_unlock_bh(&bat_priv->forw_bat_list_lock); } } static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, const struct ethhdr *ethhdr, struct batadv_ogm_packet *batadv_ogm_packet, bool is_single_hop_neigh, bool is_from_best_next_hop, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); return; } if (!is_from_best_next_hop) { /* Mark the forwarded packet when it is not coming from our * best next hop. We still need to forward the packet for our * neighbor link quality detection to work in case the packet * originated from a single hop neighbor. Otherwise we can * simply drop the ogm. */ if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_NOT_BEST_NEXT_HOP; else return; } tvlv_len = ntohs(batadv_ogm_packet->tvlv_len); batadv_ogm_packet->ttl--; ether_addr_copy(batadv_ogm_packet->prev_sender, ethhdr->h_source); /* apply hop penalty */ batadv_ogm_packet->tq = batadv_hop_penalty(batadv_ogm_packet->tq, bat_priv); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: tq: %i, ttl: %i\n", batadv_ogm_packet->tq, batadv_ogm_packet->ttl); if (is_single_hop_neigh) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else batadv_ogm_packet->flags &= ~BATADV_DIRECTLINK; batadv_iv_ogm_queue_add(bat_priv, (unsigned char *)batadv_ogm_packet, BATADV_OGM_HLEN + tvlv_len, if_incoming, if_outgoing, 0, batadv_iv_ogm_fwd_send_time()); } /** * batadv_iv_ogm_slide_own_bcast_window() - bitshift own OGM broadcast windows * for the given interface * @hard_iface: the interface for which the windows have to be shifted */ static void batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; unsigned long *word; u32 i; u8 *w; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { hlist_for_each_entry_rcu(orig_ifinfo, &orig_node->ifinfo_list, list) { if (orig_ifinfo->if_outgoing != hard_iface) continue; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); word = orig_ifinfo->bat_iv.bcast_own; batadv_bit_get_packet(bat_priv, word, 1, 0); w = &orig_ifinfo->bat_iv.bcast_own_sum; *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); } } rcu_read_unlock(); } } /** * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer * @hard_iface: interface whose ogm buffer should be transmitted */ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; u32 seqno; u16 tvlv_len = 0; unsigned long send_time; lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex); /* interface already disabled by batadv_iv_ogm_iface_disable */ if (!*ogm_buff) return; /* the interface gets activated here to avoid race conditions between * the moment of activating the interface in * hardif_activate_interface() where the originator mac is set and * outdated packets (especially uninitialized mac addresses) in the * packet queue */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; primary_if = batadv_primary_if_get_selected(bat_priv); if (hard_iface == primary_if) { /* tt changes have to be committed before the tvlv data is * appended as it may alter the tt tvlv container */ batadv_tt_local_commit_changes(bat_priv); tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, ogm_buff, ogm_buff_len, BATADV_OGM_HLEN); } batadv_ogm_packet = (struct batadv_ogm_packet *)(*ogm_buff); batadv_ogm_packet->tvlv_len = htons(tvlv_len); /* change sequence number to network order */ seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno); batadv_ogm_packet->seqno = htonl(seqno); atomic_inc(&hard_iface->bat_iv.ogm_seqno); batadv_iv_ogm_slide_own_bcast_window(hard_iface); send_time = batadv_iv_ogm_emit_send_time(bat_priv); if (hard_iface != primary_if) { /* OGMs from secondary interfaces are only scheduled on their * respective interfaces. */ batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, hard_iface, 1, send_time); goto out; } /* OGMs from primary interfaces are scheduled on all * interfaces. */ rcu_read_lock(); list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) continue; if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); batadv_hardif_put(tmp_hard_iface); } rcu_read_unlock(); out: batadv_hardif_put(primary_if); } static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex); batadv_iv_ogm_schedule_buff(hard_iface); mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex); } /** * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface * @orig_node: originator which reproadcasted the OGMs directly * @if_outgoing: interface which transmitted the original OGM and received the * direct rebroadcast * * Return: Number of replied (rebroadcasted) OGMs which were transmitted by * an originator and directly (without intermediate hop) received by a specific * interface */ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_orig_ifinfo *orig_ifinfo; u8 sum; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing); if (!orig_ifinfo) return 0; spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); sum = orig_ifinfo->bat_iv.bcast_own_sum; spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); return sum; } /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the orig node who originally emitted the ogm packet * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node * @ethhdr: Ethernet header of the OGM * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * @dup_status: the duplicate status of this ogm packet. */ static void batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo *orig_ifinfo, const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, enum batadv_dup_status dup_status) { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; u8 sum_orig, sum_neigh; u8 *neigh_addr; u8 tq_avg; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s(): Searching and updating originator entry of received packet\n", __func__); rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { neigh_addr = tmp_neigh_node->addr; if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && tmp_neigh_node->if_incoming == if_incoming && kref_get_unless_zero(&tmp_neigh_node->refcount)) { if (WARN(neigh_node, "too many matching neigh_nodes")) batadv_neigh_node_put(neigh_node); neigh_node = tmp_neigh_node; continue; } if (dup_status != BATADV_NO_DUP) continue; /* only update the entry for this outgoing interface */ neigh_ifinfo = batadv_neigh_ifinfo_get(tmp_neigh_node, if_outgoing); if (!neigh_ifinfo) continue; spin_lock_bh(&tmp_neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, 0); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&tmp_neigh_node->ifinfo_lock); batadv_neigh_ifinfo_put(neigh_ifinfo); neigh_ifinfo = NULL; } if (!neigh_node) { struct batadv_orig_node *orig_tmp; orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_tmp) goto unlock; neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, orig_node, orig_tmp); batadv_orig_node_put(orig_tmp); if (!neigh_node) goto unlock; } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Updating existing last-hop neighbor of originator\n"); } rcu_read_unlock(); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) goto out; neigh_node->last_seen = jiffies; spin_lock_bh(&neigh_node->ifinfo_lock); batadv_ring_buffer_set(neigh_ifinfo->bat_iv.tq_recv, &neigh_ifinfo->bat_iv.tq_index, batadv_ogm_packet->tq); tq_avg = batadv_ring_buffer_avg(neigh_ifinfo->bat_iv.tq_recv); neigh_ifinfo->bat_iv.tq_avg = tq_avg; spin_unlock_bh(&neigh_node->ifinfo_lock); if (dup_status == BATADV_NO_DUP) { orig_ifinfo->last_ttl = batadv_ogm_packet->ttl; neigh_ifinfo->last_ttl = batadv_ogm_packet->ttl; } /* if this neighbor already is our next hop there is nothing * to change */ router = batadv_orig_router_get(orig_node, if_outgoing); if (router == neigh_node) goto out; if (router) { router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); if (!router_ifinfo) goto out; /* if this neighbor does not offer a better TQ we won't * consider it */ if (router_ifinfo->bat_iv.tq_avg > neigh_ifinfo->bat_iv.tq_avg) goto out; } /* if the TQ is the same and the link not more symmetric we * won't consider it either */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, router->if_incoming); sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, neigh_node->if_incoming); if (sum_orig >= sum_neigh) goto out; } batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); goto out; unlock: rcu_read_unlock(); out: batadv_neigh_node_put(neigh_node); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(neigh_ifinfo); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_iv_ogm_calc_tq() - calculate tq for current received ogm packet * @orig_node: the orig node who originally emitted the ogm packet * @orig_neigh_node: the orig node struct of the neighbor who sent the packet * @batadv_ogm_packet: the ogm packet * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: true if the link can be considered bidirectional, false otherwise */ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh_node, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; bool ret = false; /* find corresponding one hop neighbor */ rcu_read_lock(); hlist_for_each_entry_rcu(tmp_neigh_node, &orig_neigh_node->neigh_list, list) { if (!batadv_compare_eth(tmp_neigh_node->addr, orig_neigh_node->orig)) continue; if (tmp_neigh_node->if_incoming != if_incoming) continue; if (!kref_get_unless_zero(&tmp_neigh_node->refcount)) continue; neigh_node = tmp_neigh_node; break; } rcu_read_unlock(); if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, orig_neigh_node, orig_neigh_node); if (!neigh_node) goto out; /* if orig_node is direct neighbor update neigh_node last_seen */ if (orig_node == orig_neigh_node) neigh_node->last_seen = jiffies; orig_node->last_seen = jiffies; /* find packet count of corresponding one hop neighbor */ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming); neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (neigh_ifinfo) { neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } else { neigh_rq_count = 0; } /* pay attention to not get a value bigger than 100 % */ if (orig_eq_count > neigh_rq_count) total_count = neigh_rq_count; else total_count = orig_eq_count; /* if we have too few packets (too less data) we set tq_own to zero * if we receive too few packets it is not considered bidirectional */ if (total_count < BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM || neigh_rq_count < BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM) tq_own = 0; else /* neigh_node->real_packet_count is never zero as we * only purge old information when getting new * information */ tq_own = (BATADV_TQ_MAX_VALUE * total_count) / neigh_rq_count; /* 1 - ((1-x) ** 3), normalized to TQ_MAX_VALUE this does * affect the nearly-symmetric links only a little, but * punishes asymmetric links more. This will give a value * between 0 and TQ_MAX_VALUE */ neigh_rq_inv = BATADV_TQ_LOCAL_WINDOW_SIZE - neigh_rq_count; neigh_rq_inv_cube = neigh_rq_inv * neigh_rq_inv * neigh_rq_inv; neigh_rq_max_cube = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE; inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube; inv_asym_penalty /= neigh_rq_max_cube; tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty; tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty); /* penalize if the OGM is forwarded on the same interface. WiFi * interfaces and other half duplex devices suffer from throughput * drops as they can't send and receive at the same time. */ if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty, bat_priv); combined_tq = batadv_ogm_packet->tq * tq_own * tq_asym_penalty * tq_iface_hop_penalty; combined_tq /= BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE * BATADV_TQ_MAX_VALUE; batadv_ogm_packet->tq = combined_tq; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n", orig_node->orig, orig_neigh_node->orig, total_count, neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_hop_penalty, batadv_ogm_packet->tq, if_incoming->net_dev->name, if_outgoing ? if_outgoing->net_dev->name : "DEFAULT"); /* if link has the minimum required transmission quality * consider it bidirectional */ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT) ret = true; out: batadv_neigh_node_put(neigh_node); return ret; } /** * batadv_iv_ogm_update_seqnos() - process a batman packet for all interfaces, * adjust the sequence number and find out whether it is a duplicate * @ethhdr: ethernet header of the packet * @batadv_ogm_packet: OGM packet to be considered * @if_incoming: interface on which the OGM packet was received * @if_outgoing: interface for which the retransmission should be considered * * Return: duplicate status as enum batadv_dup_status */ static enum batadv_dup_status batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, const struct batadv_ogm_packet *batadv_ogm_packet, const struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; bool is_dup; s32 seq_diff; bool need_update = false; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; u32 seqno = ntohl(batadv_ogm_packet->seqno); u8 *neigh_addr; u8 packet_count; unsigned long *bitmap; orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); if (!orig_node) return BATADV_NO_DUP; orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (WARN_ON(!orig_ifinfo)) { batadv_orig_node_put(orig_node); return 0; } spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); seq_diff = seqno - orig_ifinfo->last_real_seqno; /* signalize caller that the packet is to be dropped. */ if (!hlist_empty(&orig_node->neigh_list) && batadv_window_protected(bat_priv, seq_diff, BATADV_TQ_LOCAL_WINDOW_SIZE, &orig_ifinfo->batman_seqno_reset, NULL)) { ret = BATADV_PROTECTED; goto out; } rcu_read_lock(); hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing); if (!neigh_ifinfo) continue; neigh_addr = neigh_node->addr; is_dup = batadv_test_bit(neigh_ifinfo->bat_iv.real_bits, orig_ifinfo->last_real_seqno, seqno); if (batadv_compare_eth(neigh_addr, ethhdr->h_source) && neigh_node->if_incoming == if_incoming) { set_mark = 1; if (is_dup) ret = BATADV_NEIGH_DUP; } else { set_mark = 0; if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } /* if the window moved, set the update flag. */ bitmap = neigh_ifinfo->bat_iv.real_bits; need_update |= batadv_bit_get_packet(bat_priv, bitmap, seq_diff, set_mark); packet_count = bitmap_weight(bitmap, BATADV_TQ_LOCAL_WINDOW_SIZE); neigh_ifinfo->bat_iv.real_packet_count = packet_count; batadv_neigh_ifinfo_put(neigh_ifinfo); } rcu_read_unlock(); if (need_update) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s updating last_seqno: old %u, new %u\n", if_outgoing ? if_outgoing->net_dev->name : "DEFAULT", orig_ifinfo->last_real_seqno, seqno); orig_ifinfo->last_real_seqno = seqno; } out: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_node_put(orig_node); batadv_orig_ifinfo_put(orig_ifinfo); return ret; } /** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface * @skb: the skb containing the OGM * @ogm_offset: offset from skb->data to start of ogm header * @orig_node: the (cached) orig node for the originator of this OGM * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered */ static void batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_ogm_packet *ogm_packet; enum batadv_dup_status dup_status; bool is_from_best_next_hop = false; bool is_single_hop_neigh = false; bool sameseq, similar_ttl; struct sk_buff *skb_priv; struct ethhdr *ethhdr; u8 *prev_sender; bool is_bidirect; /* create a private copy of the skb, as some functions change tq value * and/or flags. */ skb_priv = skb_copy(skb, GFP_ATOMIC); if (!skb_priv) return; ethhdr = eth_hdr(skb_priv); ogm_packet = (struct batadv_ogm_packet *)(skb_priv->data + ogm_offset); dup_status = batadv_iv_ogm_update_seqnos(ethhdr, ogm_packet, if_incoming, if_outgoing); if (batadv_compare_eth(ethhdr->h_source, ogm_packet->orig)) is_single_hop_neigh = true; if (dup_status == BATADV_PROTECTED) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: packet within seqno protection time (sender: %pM)\n", ethhdr->h_source); goto out; } if (ogm_packet->tq == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet with tq equal 0\n"); goto out; } if (is_single_hop_neigh) { hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source); if (hardif_neigh) hardif_neigh->last_seen = jiffies; } router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { router_router = batadv_orig_router_get(router->orig_node, if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } if ((router_ifinfo && router_ifinfo->bat_iv.tq_avg != 0) && (batadv_compare_eth(router->addr, ethhdr->h_source))) is_from_best_next_hop = true; prev_sender = ogm_packet->prev_sender; /* avoid temporary routing loops */ if (router && router_router && (batadv_compare_eth(router->addr, prev_sender)) && !(batadv_compare_eth(ogm_packet->orig, prev_sender)) && (batadv_compare_eth(router->addr, router_router->addr))) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast packets that may make me loop (sender: %pM)\n", ethhdr->h_source); goto out; } if (if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_ogm_receive(bat_priv, ogm_packet, orig_node); /* if sender is a direct neighbor the sender mac equals * originator mac */ if (is_single_hop_neigh) orig_neigh_node = orig_node; else orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) goto out; /* Update nc_nodes of the originator */ batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, ogm_packet, is_single_hop_neigh); orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; } is_bidirect = batadv_iv_ogm_calc_tq(orig_node, orig_neigh_node, ogm_packet, if_incoming, if_outgoing); /* update ranking if it is not a duplicate or has the same * seqno and similar ttl as the non-duplicate */ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); if (!orig_ifinfo) goto out_neigh; sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, ogm_packet, if_incoming, if_outgoing, dup_status); } batadv_orig_ifinfo_put(orig_ifinfo); /* only forward for specific interface, not for the default one. */ if (if_outgoing == BATADV_IF_DEFAULT) goto out_neigh; /* is single hop (direct) neighbor */ if (is_single_hop_neigh) { /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ if (ogm_packet->ttl <= 2 && if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; } /* mark direct link on incoming interface */ batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast neighbor packet with direct link flag\n"); goto out_neigh; } /* multihop originator */ if (!is_bidirect) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: not received via bidirectional link\n"); goto out_neigh; } if (dup_status == BATADV_NEIGH_DUP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: duplicate packet received\n"); goto out_neigh; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding packet: rebroadcast originator packet\n"); batadv_iv_ogm_forward(orig_node, ethhdr, ogm_packet, is_single_hop_neigh, is_from_best_next_hop, if_incoming, if_outgoing); out_neigh: if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); batadv_neigh_node_put(router_router); batadv_neigh_node_put(orig_neigh_router); batadv_hardif_neigh_put(hardif_neigh); consume_skb(skb_priv); } /** * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it * @ogm_packet: rebroadcast OGM packet to process * @if_incoming: the interface where this packet was received * @orig_node: originator which reproadcasted the OGMs * @if_incoming_seqno: OGM sequence number when rebroadcast was received */ static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, struct batadv_hard_iface *if_incoming, struct batadv_orig_node *orig_node, u32 if_incoming_seqno) { struct batadv_orig_ifinfo *orig_ifinfo; s32 bit_pos; u8 *weight; /* neighbor has to indicate direct link and it has to * come via the corresponding interface */ if (!(ogm_packet->flags & BATADV_DIRECTLINK)) return; if (!batadv_compare_eth(if_incoming->net_dev->dev_addr, ogm_packet->orig)) return; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming); if (!orig_ifinfo) return; /* save packet seqno for bidirectional check */ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); bit_pos = if_incoming_seqno - 2; bit_pos -= ntohl(ogm_packet->seqno); batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos); weight = &orig_ifinfo->bat_iv.bcast_own_sum; *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE); spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); batadv_orig_ifinfo_put(orig_ifinfo); } /** * batadv_iv_ogm_process() - process an incoming batman iv OGM * @skb: the skb containing the OGM * @ogm_offset: offset to the OGM which should be processed (for aggregates) * @if_incoming: the interface where this packet was received */ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; u32 if_incoming_seqno; bool has_directlink_flag; struct ethhdr *ethhdr; bool is_my_oldorig = false; bool is_my_addr = false; bool is_my_orig = false; ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset); ethhdr = eth_hdr(skb); /* Silently drop when the batman packet is actually not a * correct packet. * * This might happen if a packet is padded (e.g. Ethernet has a * minimum frame length of 64 byte) and the aggregation interprets * it as an additional length. * * TODO: A more sane solution would be to have a bit in the * batadv_ogm_packet to detect whether the packet is the last * packet in an aggregation. Here we expect that the padding * is always zero (or not 0x01) */ if (ogm_packet->packet_type != BATADV_IV_OGM) return; /* could be changed by schedule_own_packet() */ if_incoming_seqno = atomic_read(&if_incoming->bat_iv.ogm_seqno); if (ogm_packet->flags & BATADV_DIRECTLINK) has_directlink_flag = true; else has_directlink_flag = false; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Received BATMAN packet via NB: %pM, IF: %s [%pM] (from OG: %pM, via prev OG: %pM, seqno %u, tq %d, TTL %d, V %d, IDF %d)\n", ethhdr->h_source, if_incoming->net_dev->name, if_incoming->net_dev->dev_addr, ogm_packet->orig, ogm_packet->prev_sender, ntohl(ogm_packet->seqno), ogm_packet->tq, ogm_packet->ttl, ogm_packet->version, has_directlink_flag); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != if_incoming->soft_iface) continue; if (batadv_compare_eth(ethhdr->h_source, hard_iface->net_dev->dev_addr)) is_my_addr = true; if (batadv_compare_eth(ogm_packet->orig, hard_iface->net_dev->dev_addr)) is_my_orig = true; if (batadv_compare_eth(ogm_packet->prev_sender, hard_iface->net_dev->dev_addr)) is_my_oldorig = true; } rcu_read_unlock(); if (is_my_addr) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: received my own broadcast (sender: %pM)\n", ethhdr->h_source); return; } if (is_my_orig) { orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) return; batadv_iv_ogm_process_reply(ogm_packet, if_incoming, orig_neigh_node, if_incoming_seqno); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: originator packet from myself (via neighbor)\n"); batadv_orig_node_put(orig_neigh_node); return; } if (is_my_oldorig) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all rebroadcast echos (sender: %pM)\n", ethhdr->h_source); return; } if (ogm_packet->flags & BATADV_NOT_BEST_NEXT_HOP) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: ignoring all packets not forwarded from the best next hop (sender: %pM)\n", ethhdr->h_source); return; } orig_node = batadv_iv_ogm_orig_get(bat_priv, ogm_packet->orig); if (!orig_node) return; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, hard_iface); batadv_hardif_put(hard_iface); } rcu_read_unlock(); batadv_orig_node_put(orig_node); } static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_forw_packet *forw_packet; struct batadv_priv *bat_priv; bool dropped = false; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; goto out; } batadv_iv_ogm_emit(forw_packet); /* we have to have at least one packet in the queue to determine the * queues wake up time unless we are shutting down. * * only re-schedule if this is the "original" copy, e.g. the OGM of the * primary interface should only be rescheduled once per period, but * this function will be called for the forw_packet instances of the * other secondary interfaces as well. */ if (forw_packet->own && forw_packet->if_incoming == forw_packet->if_outgoing) batadv_iv_ogm_schedule(forw_packet->if_incoming); out: /* do we get something for free()? */ if (batadv_forw_packet_steal(forw_packet, &bat_priv->forw_bat_list_lock)) batadv_forw_packet_free(forw_packet, dropped); } static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm_packet *ogm_packet; u8 *packet_pos; int ogm_offset; bool res; int ret = NET_RX_DROP; res = batadv_check_management_packet(skb, if_incoming, BATADV_OGM_HLEN); if (!res) goto free_skb; /* did we receive a B.A.T.M.A.N. IV OGM packet on an interface * that does not have B.A.T.M.A.N. IV enabled ? */ if (bat_priv->algo_ops->iface.enable != batadv_iv_ogm_iface_enable) goto free_skb; batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX); batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES, skb->len + ETH_HLEN); ogm_offset = 0; ogm_packet = (struct batadv_ogm_packet *)skb->data; /* unpack the aggregated packets and process them one by one */ while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb), ogm_packet)) { batadv_iv_ogm_process(skb, ogm_offset, if_incoming); ogm_offset += BATADV_OGM_HLEN; ogm_offset += ntohs(ogm_packet->tvlv_len); packet_pos = skb->data + ogm_offset; ogm_packet = (struct batadv_ogm_packet *)packet_pos; } ret = NET_RX_SUCCESS; free_skb: if (ret == NET_RX_SUCCESS) consume_skb(skb); else kfree_skb(skb); return ret; } /** * batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a * given outgoing interface. * @neigh_node: Neighbour of interest * @if_outgoing: Outgoing interface of interest * @tq_avg: Pointer of where to store the TQ average * * Return: False if no average TQ available, otherwise true. */ static bool batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_outgoing, u8 *tq_avg) { struct batadv_neigh_ifinfo *n_ifinfo; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return false; *tq_avg = n_ifinfo->bat_iv.tq_avg; batadv_neigh_ifinfo_put(n_ifinfo); return true; } /** * batadv_iv_ogm_orig_dump_subentry() - Dump an originator subentry into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { void *hdr; u8 tq_avg; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) return 0; if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; u8 tq_avg_best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, &tq_avg_best)) goto out; if (tq_avg_best == 0) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_iv_ogm_orig_dump_bucket() - Dump an originator bucket into a * message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_iv_ogm_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_iv_ogm_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } /** * batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * @diff: pointer to integer receiving the calculated difference * * The content of *@diff is only valid when this function returns true. * It is less, equal to or greater than 0 if the metric via neigh1 is lower, * the same as or higher than the metric via neigh2 * * Return: true when the difference could be calculated, false otherwise */ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2, int *diff) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; u8 tq1, tq2; bool ret = true; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!neigh1_ifinfo || !neigh2_ifinfo) { ret = false; goto out; } tq1 = neigh1_ifinfo->bat_iv.tq_avg; tq2 = neigh2_ifinfo->bat_iv.tq_avg; *diff = (int)tq1 - (int)tq2; out: batadv_neigh_ifinfo_put(neigh1_ifinfo); batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } /** * batadv_iv_ogm_neigh_dump_neigh() - Dump a neighbour into a netlink message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_iv_ogm_neigh_dump_hardif() - Dump the neighbours of a hard interface * into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: Hard interface to dump the neighbours for * @idx_s: Number of entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dump to this hard interface */ static void batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_iv_ogm_neigh_cmp() - compare the metrics of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: a value less, equal to or greater than 0 if the metric via neigh1 is * lower, the same as or higher than the metric via neigh2 */ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return 0; return diff; } /** * batadv_iv_ogm_neigh_is_sob() - check if neigh1 is similarly good or better * than neigh2 from the metric prospective * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor * * Return: true if the metric via neigh1 is equally good or better than * the metric via neigh2, false otherwise. */ static bool batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { bool ret; int diff; ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, if_outgoing2, &diff); if (!ret) return false; ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD; return ret; } static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) { /* begin scheduling originator messages on that interface */ batadv_iv_ogm_schedule(hard_iface); } /** * batadv_iv_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) { /* set default TQ difference threshold to 20 */ atomic_set(&bat_priv->gw.sel_class, 20); } static struct batadv_gw_node * batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; u64 max_gw_factor = 0; u64 tmp_gw_factor = 0; u8 max_tq = 0; u8 tq_avg; struct batadv_orig_node *orig_node; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) continue; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto next; if (!kref_get_unless_zero(&gw_node->refcount)) goto next; tq_avg = router_ifinfo->bat_iv.tq_avg; switch (atomic_read(&bat_priv->gw.sel_class)) { case 1: /* fast connection */ tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; if (tmp_gw_factor > max_gw_factor || (tmp_gw_factor == max_gw_factor && tq_avg > max_tq)) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; default: /* 2: stable connection (use best statistic) * 3: fast-switch (use best statistic but change as * soon as a better gateway appears) * XX: late-switch (use best statistic but change as * soon as a better gateway appears which has * $routing_class more tq points) */ if (tq_avg > max_tq) { batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } break; } if (tq_avg > max_tq) max_tq = tq_avg; if (tmp_gw_factor > max_gw_factor) max_gw_factor = tmp_gw_factor; batadv_gw_node_put(gw_node); next: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); return curr_gw; } static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL; struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL; struct batadv_neigh_node *router_gw = NULL; struct batadv_neigh_node *router_orig = NULL; u8 gw_tq_avg, orig_tq_avg; bool ret = false; /* dynamic re-election is performed only on fast or late switch */ if (atomic_read(&bat_priv->gw.sel_class) <= 2) return false; router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); if (!router_gw) { ret = true; goto out; } router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw, BATADV_IF_DEFAULT); if (!router_gw_ifinfo) { ret = true; goto out; } router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router_orig) goto out; router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig, BATADV_IF_DEFAULT); if (!router_orig_ifinfo) goto out; gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg; orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg; /* the TQ value has to be better */ if (orig_tq_avg < gw_tq_avg) goto out; /* if the routing class is greater than 3 the value tells us how much * greater the TQ value of the new gateway must be */ if ((atomic_read(&bat_priv->gw.sel_class) > 3) && (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", gw_tq_avg, orig_tq_avg); ret = true; out: batadv_neigh_ifinfo_put(router_gw_ifinfo); batadv_neigh_ifinfo_put(router_orig_ifinfo); batadv_neigh_node_put(router_gw); batadv_neigh_node_put(router_orig); return ret; } /** * batadv_iv_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig) || nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) || nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down) || nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_iv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .name = "BATMAN_IV", .iface = { .enable = batadv_iv_ogm_iface_enable, .enabled = batadv_iv_iface_enabled, .disable = batadv_iv_ogm_iface_disable, .update_mac = batadv_iv_ogm_iface_update_mac, .primary_set = batadv_iv_ogm_primary_iface_set, }, .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, .dump = batadv_iv_ogm_neigh_dump, }, .orig = { .dump = batadv_iv_ogm_orig_dump, }, .gw = { .init_sel_class = batadv_iv_init_sel_class, .sel_class_max = BATADV_TQ_MAX_VALUE, .get_best_gw_node = batadv_iv_gw_get_best_gw_node, .is_eligible = batadv_iv_gw_is_eligible, .dump = batadv_iv_gw_dump, }, }; /** * batadv_iv_init() - B.A.T.M.A.N. IV initialization function * * Return: 0 on success or negative error number in case of failure */ int __init batadv_iv_init(void) { int ret; /* batman originator packet */ ret = batadv_recv_handler_register(BATADV_IV_OGM, batadv_iv_ogm_receive); if (ret < 0) goto out; ret = batadv_algo_register(&batadv_batman_iv); if (ret < 0) goto handler_unregister; goto out; handler_unregister: batadv_recv_handler_unregister(BATADV_IV_OGM); out: return ret; }
705 709 709 709 709 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * netprio_cgroup.h Control Group Priority set * * Authors: Neil Horman <nhorman@tuxdriver.com> */ #ifndef _NETPRIO_CGROUP_H #define _NETPRIO_CGROUP_H #include <linux/cgroup.h> #include <linux/hardirq.h> #include <linux/rcupdate.h> #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map { struct rcu_head rcu; u32 priomap_len; u32 priomap[]; }; static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; u32 idx; rcu_read_lock(); css = task_css(p, net_prio_cgrp_id); idx = css->id; rcu_read_unlock(); return idx; } static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) { if (in_interrupt()) return; sock_cgroup_set_prioidx(skcd, task_netprioidx(current)); } #else /* !CONFIG_CGROUP_NET_PRIO */ static inline u32 task_netprioidx(struct task_struct *p) { return 0; } static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd) { } #endif /* CONFIG_CGROUP_NET_PRIO */ #endif /* _NET_CLS_CGROUP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vsyscall #if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __VSYSCALL_TRACE_H #include <linux/tracepoint.h> TRACE_EVENT(emulate_vsyscall, TP_PROTO(int nr), TP_ARGS(nr), TP_STRUCT__entry(__field(int, nr)), TP_fast_assign( __entry->nr = nr; ), TP_printk("nr = %d", __entry->nr) ); #endif #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ #define TRACE_INCLUDE_FILE vsyscall_trace #include <trace/define_trace.h>
18 12 14 18 17 17 15 14 13 11 12 2 12 13 12 11 11 10 1 1 1 1 17 18 17 10 10 9 9 10 10 9 10 10 10 6 6 6 6 6 6 10 10 10 9 9 10 6 5 5 5 10 10 5 5 5 5 5 5 5 6 2 2 6 13 6 6 6 6 6 2 6 6 6 6 6 2 2 2 2 2 2 5 2 5 5 5 5 2 2 2 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 28 27 26 27 25 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 // SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/btf.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/ioport.h> #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> #include <linux/device.h> #include <linux/freezer.h> #include <linux/panic_notifier.h> #include <linux/pm.h> #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/hash.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; bool kexec_file_dbg_print; /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. The caller is * responsible for making certain we don't attempt to load * the new image into invalid or reserved areas of RAM. This * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size * granularity. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if (mstart > mend) return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. * If we alloed overlapping destination addresses * through very weird things can happen with no * easy explanation as one segment stops on another. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) return -EINVAL; } } /* Ensure our buffer sizes are strictly less than * our memory sizes. This should always be the case, * and it is easier to check up front than to be surprised * later on. */ for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) return -EINVAL; } /* * Verify that no more than half of memory will be consumed. If the * request from userspace is too large, a large amount of time will be * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } if (total_pages > nr_pages / 2) return -EINVAL; #ifdef CONFIG_CRASH_DUMP /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. */ if (image->type == KEXEC_TYPE_CRASH) { for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < phys_to_boot_phys(crashk_res.start)) || (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } #endif return 0; } struct kimage *do_kimage_alloc_init(void) { struct kimage *image; /* Allocate a controlling structure */ image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) return NULL; image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ image->type = KEXEC_TYPE_DEFAULT; /* Initialize the list of control pages */ INIT_LIST_HEAD(&image->control_pages); /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); #ifdef CONFIG_CRASH_HOTPLUG image->hp_action = KEXEC_CRASH_HP_NONE; image->elfcorehdr_index = -1; image->elfcorehdr_updated = false; #endif return image; } int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((end >= mstart) && (start <= mend)) return 1; } return 0; } static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; if (fatal_signal_pending(current)) return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); arch_kexec_post_alloc_pages(page_address(pages), count, gfp_mask); if (gfp_mask & __GFP_ZERO) for (i = 0; i < count; i++) clear_highpage(pages + i); } return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; order = page_private(page); count = 1 << order; arch_kexec_pre_free_pages(page_address(page), count); for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); } void kimage_free_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } } static struct page *kimage_alloc_normal_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * At worst this runs in O(N) of the image size. */ struct list_head extra_pages; struct page *pages; unsigned int count; count = 1 << order; INIT_LIST_HEAD(&extra_pages); /* Loop while I can allocate a page and the page allocated * is a destination page. */ do { unsigned long pfn, epfn, addr, eaddr; pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = (epfn << PAGE_SHIFT) - 1; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } } while (!pages); if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); /* Because the page is already in it's destination * location we will never allocate another page at * that address. Therefore kimage_alloc_pages * will not return it (again) and we don't need * to give it an entry in image->segment[]. */ } /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return pages; } #ifdef CONFIG_CRASH_DUMP static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * Control pages are also the only pags we must allocate * when loading a crash kernel. All of the other pages * are specified by the segments and we just memcpy * into them directly. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * Given the low demand this implements a very simple * allocator that finds the first hole of the appropriate * size in the reserved memory region, and allocates all * of the memory up to and including the hole. */ unsigned long hole_start, hole_end, size; struct page *pages; pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = ALIGN(image->control_page, size); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; cond_resched(); if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ hole_start = ALIGN(mend, size); hole_end = hole_start + size - 1; break; } } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); image->control_page = hole_end + 1; break; } } /* Ensure that these pages are decrypted if SME is enabled. */ if (pages) arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); return pages; } #endif struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { struct page *pages = NULL; switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; #endif } return pages; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) image->entry++; if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); if (!page) return -ENOMEM; ind_page = page_address(page); *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; return 0; } static int kimage_set_destination(struct kimage *image, unsigned long destination) { destination &= PAGE_MASK; return kimage_add_entry(image, destination | IND_DESTINATION); } static int kimage_add_page(struct kimage *image, unsigned long page) { page &= PAGE_MASK; return kimage_add_entry(image, page | IND_SOURCE); } static void kimage_free_extra_pages(struct kimage *image) { /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unusable_pages); } void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; } #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if (!image) return; #ifdef CONFIG_CRASH_DUMP if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } #endif kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Save this indirection page until we are * done with it. */ ind = entry; } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. */ if (image->file_mode) kimage_file_post_load_cleanup(image); kfree(image); } static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; else if (entry & IND_SOURCE) { if (page == destination) return ptr; destination += PAGE_SIZE; } } return NULL; } static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long destination) { /* * Here we implement safeguards to ensure that a source page * is not copied to its destination page before the data on * the destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE - 1)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. */ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. */ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kunmap_local(ptr); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #ifdef CONFIG_CRASH_DUMP static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } if (uchunk) { /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); ubytes -= uchunk; if (image->file_mode) kbuf += uchunk; else buf += uchunk; } kexec_flush_icache_page(page); kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } maddr += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } #endif int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; #endif } return result; } struct kexec_load_limit { /* Mutex protects the limit count. */ struct mutex mutex; int limit; }; static struct kexec_load_limit load_limit_reboot = { .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), .limit = -1, }; static struct kexec_load_limit load_limit_panic = { .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), .limit = -1, }; struct kimage *kexec_image; struct kimage *kexec_crash_image; static int kexec_load_disabled; #ifdef CONFIG_SYSCTL static int kexec_limit_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct kexec_load_limit *limit = table->data; int val; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, }; int ret; if (write) { ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (ret) return ret; if (val < 0) return -EINVAL; mutex_lock(&limit->mutex); if (limit->limit != -1 && val >= limit->limit) ret = -EINVAL; else limit->limit = val; mutex_unlock(&limit->mutex); return ret; } mutex_lock(&limit->mutex); val = limit->limit; mutex_unlock(&limit->mutex); return proc_dointvec(&tmp, write, buffer, lenp, ppos); } static struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, { .procname = "kexec_load_limit_panic", .data = &load_limit_panic, .mode = 0644, .proc_handler = kexec_limit_handler, }, { .procname = "kexec_load_limit_reboot", .data = &load_limit_reboot, .mode = 0644, .proc_handler = kexec_limit_handler, }, }; static int __init kexec_core_sysctl_init(void) { register_sysctl_init("kernel", kexec_core_sysctls); return 0; } late_initcall(kexec_core_sysctl_init); #endif bool kexec_load_permitted(int kexec_image_type) { struct kexec_load_limit *limit; /* * Only the superuser can use the kexec syscall and if it has not * been disabled. */ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return false; /* Check limit counter and decrease it.*/ limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? &load_limit_panic : &load_limit_reboot; mutex_lock(&limit->mutex); if (!limit->limit) { mutex_unlock(&limit->mutex); return false; } if (limit->limit != -1) limit->limit--; mutex_unlock(&limit->mutex); return true; } /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } suspend_console(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, * but *not* dpm_suspend_end(). We *must* call * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); syscore_shutdown(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); pr_notice("Starting new kernel\n"); machine_shutdown(); } kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: kexec_unlock(); return error; }
2 3 3 10 10 10 21 21 9 9 20 21 21 21 21 20 21 21 20 21 21 21 19 21 21 19 18 19 19 18 20 20 20 15 15 15 15 3 12 12 10 6 6 6 6 5 6 6 6 6 6 6 6 6 6 6 6 6 6 3 3 13 12 13 1 13 13 13 13 1 13 12 12 12 12 11 6 6 6 6 6 6 6 6 6 2 2 2 2 1 1 1 1 1 1 2 12 2 2 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 10 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 // SPDX-License-Identifier: GPL-2.0-or-later /* * Forwarding database * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/unaligned.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <trace/events/bridge.h> #include "br_private.h" static const struct rhashtable_params br_fdb_rht_params = { .head_offset = offsetof(struct net_bridge_fdb_entry, rhnode), .key_offset = offsetof(struct net_bridge_fdb_entry, key), .key_len = sizeof(struct net_bridge_fdb_key), .automatic_shrinking = true, }; static struct kmem_cache *br_fdb_cache __read_mostly; int __init br_fdb_init(void) { br_fdb_cache = KMEM_CACHE(net_bridge_fdb_entry, SLAB_HWCACHE_ALIGN); if (!br_fdb_cache) return -ENOMEM; return 0; } void br_fdb_fini(void) { kmem_cache_destroy(br_fdb_cache); } int br_fdb_hash_init(struct net_bridge *br) { return rhashtable_init(&br->fdb_hash_tbl, &br_fdb_rht_params); } void br_fdb_hash_fini(struct net_bridge *br) { rhashtable_destroy(&br->fdb_hash_tbl); } /* if topology_changing then use forward_delay (default 15 sec) * otherwise keep longer (default 5 minutes) */ static inline unsigned long hold_time(const struct net_bridge *br) { return br->topology_change ? br->forward_delay : br->ageing_time; } static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { return !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) && time_before_eq(fdb->updated + hold_time(br), jiffies); } static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return NUD_PERMANENT; else if (test_bit(BR_FDB_STATIC, &fdb->flags)) return NUD_NOARP; else if (has_expired(br, fdb)) return NUD_STALE; else return NUD_REACHABLE; } static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, u32 portid, u32 seq, int type, unsigned int flags) { const struct net_bridge_port *dst = READ_ONCE(fdb->dst); unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; u32 ext_flags = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = 0; ndm->ndm_type = 0; ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(br, fdb); if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) ndm->ndm_flags |= NTF_OFFLOADED; if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) ndm->ndm_flags |= NTF_EXT_LEARNED; if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; if (test_bit(BR_FDB_LOCKED, &fdb->flags)) ext_flags |= NTF_EXT_LOCKED; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags)) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->key.vlan_id)) goto nla_put_failure; if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); u8 notify_bits = FDB_NOTIFY_BIT; if (!nest) goto nla_put_failure; if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) notify_bits |= FDB_NOTIFY_INACTIVE_BIT; if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { nla_nest_cancel(skb, nest); goto nla_put_failure; } nla_nest_end(skb, nest); } nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t fdb_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, bool swdev_notify) { struct net *net = dev_net(br->dev); struct sk_buff *skb; int err = -ENOBUFS; if (swdev_notify) br_switchdev_fdb_notify(br, fdb, type); skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_key key; WARN_ON_ONCE(!rcu_read_lock_held()); key.vlan_id = vid; memcpy(key.addr.addr, addr, sizeof(key.addr.addr)); return rhashtable_lookup(tbl, &key, br_fdb_rht_params); } /* requires bridge hash_lock */ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_entry *fdb; lockdep_assert_held_once(&br->hash_lock); rcu_read_lock(); fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); rcu_read_unlock(); return fdb; } struct net_device *br_fdb_find_port(const struct net_device *br_dev, const unsigned char *addr, __u16 vid) { struct net_bridge_fdb_entry *f; struct net_device *dev = NULL; struct net_bridge *br; ASSERT_RTNL(); if (!netif_is_bridge_master(br_dev)) return NULL; br = netdev_priv(br_dev); rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); if (f && f->dst) dev = f->dst->dev; rcu_read_unlock(); return dev; } EXPORT_SYMBOL_GPL(br_fdb_find_port); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid) { return fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); } /* When a static FDB entry is added, the mac address from the entry is * added to the bridge private HW address list and all required ports * are then updated with the new information. * Called under RTNL. */ static void fdb_add_hw_addr(struct net_bridge *br, const unsigned char *addr) { int err; struct net_bridge_port *p; ASSERT_RTNL(); list_for_each_entry(p, &br->port_list, list) { if (!br_promisc_port(p)) { err = dev_uc_add(p->dev, addr); if (err) goto undo; } } return; undo: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!br_promisc_port(p)) dev_uc_del(p->dev, addr); } } /* When a static FDB entry is deleted, the HW address from that entry is * also removed from the bridge private HW address list and updates all * the ports with needed information. * Called under RTNL. */ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) { struct net_bridge_port *p; ASSERT_RTNL(); list_for_each_entry(p, &br->port_list, list) { if (!br_promisc_port(p)) dev_uc_del(p->dev, addr); } } static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, bool swdev_notify) { trace_fdb_delete(br, f); if (test_bit(BR_FDB_STATIC, &f->flags)) fdb_del_hw_addr(br, f->key.addr.addr); hlist_del_init_rcu(&f->fdb_node); rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, br_fdb_rht_params); if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &f->flags)) atomic_dec(&br->fdb_n_learned); fdb_notify(br, f, RTM_DELNEIGH, swdev_notify); kfree_rcu(f, rcu); } /* Delete a local entry if no other port had the same address. * * This function should only be called on entries with BR_FDB_LOCAL set, * so even with BR_FDB_ADDED_BY_USER cleared we never need to increase * the accounting for dynamically learned entries again. */ static void fdb_delete_local(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_fdb_entry *f) { const unsigned char *addr = f->key.addr.addr; struct net_bridge_vlan_group *vg; const struct net_bridge_vlan *v; struct net_bridge_port *op; u16 vid = f->key.vlan_id; /* Maybe another port has same hw addr? */ list_for_each_entry(op, &br->port_list, list) { vg = nbp_vlan_group(op); if (op != p && ether_addr_equal(op->dev->dev_addr, addr) && (!vid || br_vlan_find(vg, vid))) { f->dst = op; clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } } vg = br_vlan_group(br); v = br_vlan_find(vg, vid); /* Maybe bridge device has same hw addr? */ if (p && ether_addr_equal(br->dev->dev_addr, addr) && (!vid || (v && br_vlan_should_use(v)))) { f->dst = NULL; clear_bit(BR_FDB_ADDED_BY_USER, &f->flags); return; } fdb_delete(br, f, true); } void br_fdb_find_delete_local(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *f; spin_lock_bh(&br->hash_lock); f = br_fdb_find(br, addr, vid); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p) fdb_delete_local(br, p, f); spin_unlock_bh(&br->hash_lock); } static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, __u16 vid, unsigned long flags) { bool learned = !test_bit(BR_FDB_ADDED_BY_USER, &flags) && !test_bit(BR_FDB_LOCAL, &flags); u32 max_learned = READ_ONCE(br->fdb_max_learned); struct net_bridge_fdb_entry *fdb; int err; if (likely(learned)) { int n_learned = atomic_read(&br->fdb_n_learned); if (unlikely(max_learned && n_learned >= max_learned)) return NULL; __set_bit(BR_FDB_DYNAMIC_LEARNED, &flags); } fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); if (!fdb) return NULL; memcpy(fdb->key.addr.addr, addr, ETH_ALEN); WRITE_ONCE(fdb->dst, source); fdb->key.vlan_id = vid; fdb->flags = flags; fdb->updated = fdb->used = jiffies; err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, br_fdb_rht_params); if (err) { kmem_cache_free(br_fdb_cache, fdb); return NULL; } if (likely(learned)) atomic_inc(&br->fdb_n_learned); hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); return fdb; } static int fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { struct net_bridge_fdb_entry *fdb; if (!is_valid_ether_addr(addr)) return -EINVAL; fdb = br_fdb_find(br, addr, vid); if (fdb) { /* it is okay to have multiple ports with same * address, just use the first one. */ if (test_bit(BR_FDB_LOCAL, &fdb->flags)) return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); fdb_delete(br, fdb, true); } fdb = fdb_create(br, source, addr, vid, BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); if (!fdb) return -ENOMEM; fdb_add_hw_addr(br, addr); fdb_notify(br, fdb, RTM_NEWNEIGH, true); return 0; } void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); hlist_for_each_entry(f, &br->fdb_list, fdb_node) { if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) { /* delete old one */ fdb_delete_local(br, p, f); /* if this port has no vlan information * configured, we can safely be done at * this point. */ if (!vg || !vg->num_vlans) goto insert; } } insert: /* insert new address, may fail if invalid address or dup. */ fdb_add_local(br, p, newaddr, 0); if (!vg || !vg->num_vlans) goto done; /* Now add entries for every VLAN configured on the port. * This function runs under RTNL so the bitmap will not change * from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) fdb_add_local(br, p, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); } void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) { struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge_vlan *v; spin_lock_bh(&br->hash_lock); /* If old entry was unassociated with any port, then delete it. */ f = br_fdb_find(br, br->dev->dev_addr, 0); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_add_local(br, NULL, newaddr, 0); vg = br_vlan_group(br); if (!vg || !vg->num_vlans) goto out; /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not * change from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; f = br_fdb_find(br, br->dev->dev_addr, v->vid); if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); fdb_add_local(br, NULL, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); } void br_fdb_cleanup(struct work_struct *work) { struct net_bridge *br = container_of(work, struct net_bridge, gc_work.work); struct net_bridge_fdb_entry *f = NULL; unsigned long delay = hold_time(br); unsigned long work_delay = delay; unsigned long now = jiffies; /* this part is tricky, in order to avoid blocking learning and * consequently forwarding, we rely on rcu to delete objects with * delayed freeing allowing us to continue traversing */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { unsigned long this_timer = f->updated + delay; if (test_bit(BR_FDB_STATIC, &f->flags) || test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) { if (test_bit(BR_FDB_NOTIFY, &f->flags)) { if (time_after(this_timer, now)) work_delay = min(work_delay, this_timer - now); else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &f->flags)) fdb_notify(br, f, RTM_NEWNEIGH, false); } continue; } if (time_after(this_timer, now)) { work_delay = min(work_delay, this_timer - now); } else { spin_lock_bh(&br->hash_lock); if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f, true); spin_unlock_bh(&br->hash_lock); } } rcu_read_unlock(); /* Cleanup minimum 10 milliseconds apart */ work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10)); mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) { const struct net_bridge_port *dst = READ_ONCE(f->dst); int port_ifidx = dst ? dst->dev->ifindex : br->dev->ifindex; if (desc->vlan_id && desc->vlan_id != f->key.vlan_id) return false; if (desc->port_ifindex && desc->port_ifindex != port_ifidx) return false; if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) return false; return true; } /* Flush forwarding database entries matching the description */ void br_fdb_flush(struct net_bridge *br, const struct net_bridge_fdb_flush_desc *desc) { struct net_bridge_fdb_entry *f; rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (!__fdb_flush_matches(br, f, desc)) continue; spin_lock_bh(&br->hash_lock); if (!hlist_unhashed(&f->fdb_node)) fdb_delete(br, f, true); spin_unlock_bh(&br->hash_lock); } rcu_read_unlock(); } static unsigned long __ndm_state_to_fdb_flags(u16 ndm_state) { unsigned long flags = 0; if (ndm_state & NUD_PERMANENT) __set_bit(BR_FDB_LOCAL, &flags); if (ndm_state & NUD_NOARP) __set_bit(BR_FDB_STATIC, &flags); return flags; } static unsigned long __ndm_flags_to_fdb_flags(u8 ndm_flags) { unsigned long flags = 0; if (ndm_flags & NTF_USE) __set_bit(BR_FDB_ADDED_BY_USER, &flags); if (ndm_flags & NTF_EXT_LEARNED) __set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &flags); if (ndm_flags & NTF_OFFLOADED) __set_bit(BR_FDB_OFFLOADED, &flags); if (ndm_flags & NTF_STICKY) __set_bit(BR_FDB_STICKY, &flags); return flags; } static int __fdb_flush_validate_ifindex(const struct net_bridge *br, int ifindex, struct netlink_ext_ack *extack) { const struct net_device *dev; dev = __dev_get_by_index(dev_net(br->dev), ifindex); if (!dev) { NL_SET_ERR_MSG_MOD(extack, "Unknown flush device ifindex"); return -ENODEV; } if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { NL_SET_ERR_MSG_MOD(extack, "Flush device is not a bridge or bridge port"); return -EINVAL; } if (netif_is_bridge_master(dev) && dev != br->dev) { NL_SET_ERR_MSG_MOD(extack, "Flush bridge device does not match target bridge device"); return -EINVAL; } if (netif_is_bridge_port(dev)) { struct net_bridge_port *p = br_port_get_rtnl(dev); if (p->br != br) { NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); return -EINVAL; } } return 0; } static const struct nla_policy br_fdb_del_bulk_policy[NDA_MAX + 1] = { [NDA_VLAN] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2), [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, }; int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = {}; struct ndmsg *ndm = nlmsg_data(nlh); struct net_bridge_port *p = NULL; struct nlattr *tb[NDA_MAX + 1]; struct net_bridge *br; u8 ndm_flags; int err; ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, br_fdb_del_bulk_policy, extack); if (err) return err; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); } else { p = br_port_get_rtnl(dev); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge port"); return -EINVAL; } br = p->br; } if (tb[NDA_VLAN]) desc.vlan_id = nla_get_u16(tb[NDA_VLAN]); if (ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); return -EINVAL; } if (ndm->ndm_state & ~FDB_FLUSH_ALLOWED_NDM_STATES) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); return -EINVAL; } desc.flags |= __ndm_state_to_fdb_flags(ndm->ndm_state); desc.flags |= __ndm_flags_to_fdb_flags(ndm_flags); if (tb[NDA_NDM_STATE_MASK]) { u16 ndm_state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); desc.flags_mask |= __ndm_state_to_fdb_flags(ndm_state_mask); } if (tb[NDA_NDM_FLAGS_MASK]) { u8 ndm_flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask); } if (tb[NDA_IFINDEX]) { int ifidx = nla_get_s32(tb[NDA_IFINDEX]); err = __fdb_flush_validate_ifindex(br, ifidx, extack); if (err) return err; desc.port_ifindex = ifidx; } else if (p) { /* flush was invoked with port device and NTF_MASTER */ desc.port_ifindex = p->dev->ifindex; } br_debug(br, "flushing port ifindex: %d vlan id: %u flags: 0x%lx flags mask: 0x%lx\n", desc.port_ifindex, desc.vlan_id, desc.flags, desc.flags_mask); br_fdb_flush(br, &desc); return 0; } /* Flush all entries referring to a specific port. * if do_all is set also flush static entries * if vid is set delete all entries that match the vlan_id */ void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all) { struct net_bridge_fdb_entry *f; struct hlist_node *tmp; spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { if (f->dst != p) continue; if (!do_all) if (test_bit(BR_FDB_STATIC, &f->flags) || (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags) && !test_bit(BR_FDB_OFFLOADED, &f->flags)) || (vid && f->key.vlan_id != vid)) continue; if (test_bit(BR_FDB_LOCAL, &f->flags)) fdb_delete_local(br, p, f); else fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); } #if IS_ENABLED(CONFIG_ATM_LANE) /* Interface used by ATM LANE hook to test * if an addr is on some other bridge port */ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr) { struct net_bridge_fdb_entry *fdb; struct net_bridge_port *port; int ret; rcu_read_lock(); port = br_port_get_rcu(dev); if (!port) ret = 0; else { const struct net_bridge_port *dst = NULL; fdb = br_fdb_find_rcu(port->br, addr, 0); if (fdb) dst = READ_ONCE(fdb->dst); ret = dst && dst->dev != dev && dst->state == BR_STATE_FORWARDING; } rcu_read_unlock(); return ret; } #endif /* CONFIG_ATM_LANE */ /* * Fill buffer with forwarding table records in * the API format. */ int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long maxnum, unsigned long skip) { struct net_bridge_fdb_entry *f; struct __fdb_entry *fe = buf; int num = 0; memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (num >= maxnum) break; if (has_expired(br, f)) continue; /* ignore pseudo entry for local MAC address */ if (!f->dst) continue; if (skip) { --skip; continue; } /* convert from internal format to API */ memcpy(fe->mac_addr, f->key.addr.addr, ETH_ALEN); /* due to ABI compat need to split into hi/lo */ fe->port_no = f->dst->port_no; fe->port_hi = f->dst->port_no >> 8; fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags); if (!test_bit(BR_FDB_STATIC, &f->flags)) fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated); ++fe; ++num; } rcu_read_unlock(); return num; } /* Add entry for local address of interface */ int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid) { int ret; spin_lock_bh(&br->hash_lock); ret = fdb_add_local(br, source, addr, vid); spin_unlock_bh(&br->hash_lock); return ret; } /* returns true if the fdb was modified */ static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb) { return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) && test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)); } void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags) { struct net_bridge_fdb_entry *fdb; /* some users want to always flood. */ if (hold_time(br) == 0) return; fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) { if (net_ratelimit()) br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n", source->dev->name, addr, vid); } else { unsigned long now = jiffies; bool fdb_modified = false; if (now != fdb->updated) { fdb->updated = now; fdb_modified = __fdb_mark_active(fdb); } /* fastpath: update of existing entry */ if (unlikely(source != READ_ONCE(fdb->dst) && !test_bit(BR_FDB_STICKY, &fdb->flags))) { br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH); WRITE_ONCE(fdb->dst, source); fdb_modified = true; /* Take over HW learned entry */ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))) clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); /* Clear locked flag when roaming to an * unlocked port. */ if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags))) clear_bit(BR_FDB_LOCKED, &fdb->flags); } if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) { set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags)) atomic_dec(&br->fdb_n_learned); } if (unlikely(fdb_modified)) { trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { spin_lock(&br->hash_lock); fdb = fdb_create(br, source, addr, vid, flags); if (fdb) { trace_br_fdb_update(br, source, addr, vid, flags); fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts * it first, don't bother updating */ spin_unlock(&br->hash_lock); } } /* Dump information about entries, in response to GETNEIGH */ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; if (!netif_is_bridge_master(dev)) return err; if (!filter_dev) { err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); if (err < 0) return err; } rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { if (*idx < cb->args[2]) goto skip; if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) goto skip; /* !f->dst is a special case for bridge * It means the MAC belongs to the bridge * Therefore need a little more filtering * we only want to dump the !f->dst case */ if (f->dst) goto skip; } if (!filter_dev && f->dst) goto skip; err = fdb_fill_info(skb, br, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI); if (err < 0) break; skip: *idx += 1; } rcu_read_unlock(); return err; } int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; rcu_read_lock(); f = br_fdb_find_rcu(br, addr, vid); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = fdb_fill_info(skb, br, f, portid, seq, RTM_NEWNEIGH, 0); errout: rcu_read_unlock(); return err; } /* returns true if the fdb is modified */ static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify) { bool modified = false; /* allow to mark an entry as inactive, usually done on creation */ if ((notify & FDB_NOTIFY_INACTIVE_BIT) && !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) modified = true; if ((notify & FDB_NOTIFY_BIT) && !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) { /* enabled activity tracking */ modified = true; } else if (!(notify & FDB_NOTIFY_BIT) && test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) { /* disabled activity tracking, clear notify state */ clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags); modified = true; } return modified; } /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid, struct nlattr *nfea_tb[]) { bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY); bool refresh = !nfea_tb[NFEA_DONT_REFRESH]; struct net_bridge_fdb_entry *fdb; u16 state = ndm->ndm_state; bool modified = false; u8 notify = 0; /* If the port cannot learn allow only local and static entries */ if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) && !(source->state == BR_STATE_LEARNING || source->state == BR_STATE_FORWARDING)) return -EPERM; if (!source && !(state & NUD_PERMANENT)) { pr_info("bridge: RTM_NEWNEIGH %s without NUD_PERMANENT\n", br->dev->name); return -EINVAL; } if (is_sticky && (state & NUD_PERMANENT)) return -EINVAL; if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) { notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]); if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) || (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT) return -EINVAL; } fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) return -ENOENT; fdb = fdb_create(br, source, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); if (!fdb) return -ENOMEM; modified = true; } else { if (flags & NLM_F_EXCL) return -EEXIST; if (READ_ONCE(fdb->dst) != source) { WRITE_ONCE(fdb->dst, source); modified = true; } set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags)) atomic_dec(&br->fdb_n_learned); } if (fdb_to_nud(br, fdb) != state) { if (state & NUD_PERMANENT) { set_bit(BR_FDB_LOCAL, &fdb->flags); if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); } else if (state & NUD_NOARP) { clear_bit(BR_FDB_LOCAL, &fdb->flags); if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags)) fdb_add_hw_addr(br, addr); } else { clear_bit(BR_FDB_LOCAL, &fdb->flags); if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags)) fdb_del_hw_addr(br, addr); } modified = true; } if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) { change_bit(BR_FDB_STICKY, &fdb->flags); modified = true; } if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags)) modified = true; if (fdb_handle_notify(fdb, notify)) modified = true; fdb->used = jiffies; if (modified) { if (refresh) fdb->updated = jiffies; fdb_notify(br, fdb, RTM_NEWNEIGH, true); } return 0; } static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[], bool *notified, struct netlink_ext_ack *extack) { int err = 0; if (ndm->ndm_flags & NTF_USE) { if (!p) { pr_info("bridge: RTM_NEWNEIGH %s with NTF_USE is not supported\n", br->dev->name); return -EINVAL; } if (!nbp_state_should_learn(p)) return 0; local_bh_disable(); rcu_read_lock(); br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER)); rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { if (!p && !(ndm->ndm_state & NUD_PERMANENT)) { NL_SET_ERR_MSG_MOD(extack, "FDB entry towards bridge must be permanent"); return -EINVAL; } err = br_fdb_external_learn_add(br, p, addr, vid, false, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); spin_unlock_bh(&br->hash_lock); } if (!err) *notified = true; return err; } static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 }, [NFEA_DONT_REFRESH] = { .type = NLA_FLAG }, }; /* Add new permanent fdb entry with RTM_NEWNEIGH */ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, bool *notified, struct netlink_ext_ack *extack) { struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br = NULL; u32 ext_flags = 0; int err = 0; trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (is_zero_ether_addr(addr)) { pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n"); return -EINVAL; } if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (!p) { pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", dev->name); return -EINVAL; } br = p->br; vg = nbp_vlan_group(p); } if (tb[NDA_FLAGS_EXT]) ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]); if (ext_flags & NTF_EXT_LOCKED) { NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set"); return -EINVAL; } if (tb[NDA_FDB_EXT_ATTRS]) { attr = tb[NDA_FDB_EXT_ATTRS]; err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, br_nda_fdb_pol, extack); if (err) return err; } else { memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1)); } if (vid) { v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { pr_info("bridge: RTM_NEWNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); return -EINVAL; } /* VID was specified, so use it. */ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb, notified, extack); } else { err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb, notified, extack); if (err || !vg || !vg->num_vlans) goto out; /* We have vlans configured on this port and user didn't * specify a VLAN. To be nice, add/update entry for every * vlan on this port. */ list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, nfea_tb, notified, extack); if (err) goto out; } } out: return err; } static int fdb_delete_by_addr_and_port(struct net_bridge *br, const struct net_bridge_port *p, const u8 *addr, u16 vlan, bool *notified) { struct net_bridge_fdb_entry *fdb; fdb = br_fdb_find(br, addr, vlan); if (!fdb || READ_ONCE(fdb->dst) != p) return -ENOENT; fdb_delete(br, fdb, true); *notified = true; return 0; } static int __br_fdb_delete(struct net_bridge *br, const struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool *notified) { int err; spin_lock_bh(&br->hash_lock); err = fdb_delete_by_addr_and_port(br, p, addr, vid, notified); spin_unlock_bh(&br->hash_lock); return err; } /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; struct net_bridge *br; int err; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (!p) { pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", dev->name); return -EINVAL; } vg = nbp_vlan_group(p); br = p->br; } if (vid) { err = __br_fdb_delete(br, p, addr, vid, notified); } else { struct net_bridge_vlan *v; err = -ENOENT; err &= __br_fdb_delete(br, p, addr, 0, notified); if (!vg || !vg->num_vlans) return err; list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; err &= __br_fdb_delete(br, p, addr, v->vid, notified); } } return err; } int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p) { struct net_bridge_fdb_entry *f, *tmp; int err = 0; ASSERT_RTNL(); /* the key here is that static entries change only under rtnl */ rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; err = dev_uc_add(p->dev, f->key.addr.addr); if (err) goto rollback; } done: rcu_read_unlock(); return err; rollback: hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &tmp->flags)) continue; if (tmp == f) break; dev_uc_del(p->dev, tmp->key.addr.addr); } goto done; } void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) { struct net_bridge_fdb_entry *f; ASSERT_RTNL(); rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { /* We only care for static entries */ if (!test_bit(BR_FDB_STATIC, &f->flags)) continue; dev_uc_del(p->dev, f->key.addr.addr); } rcu_read_unlock(); } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool locked, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; bool modified = false; int err = 0; trace_br_fdb_external_learn_add(br, p, addr, vid); if (locked && (!p || !(p->flags & BR_PORT_MAB))) return -EINVAL; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (!fdb) { unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN); if (swdev_notify) flags |= BIT(BR_FDB_ADDED_BY_USER); if (!p) flags |= BIT(BR_FDB_LOCAL); if (locked) flags |= BIT(BR_FDB_LOCKED); fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; goto err_unlock; } fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { if (locked && (!test_bit(BR_FDB_LOCKED, &fdb->flags) || READ_ONCE(fdb->dst) != p)) { err = -EINVAL; goto err_unlock; } fdb->updated = jiffies; if (READ_ONCE(fdb->dst) != p) { WRITE_ONCE(fdb->dst, p); modified = true; } if (test_and_set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; } else { modified = true; } if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) { change_bit(BR_FDB_LOCKED, &fdb->flags); modified = true; } if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); if (!p) set_bit(BR_FDB_LOCAL, &fdb->flags); if ((swdev_notify || !p) && test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags)) atomic_dec(&br->fdb_n_learned); if (modified) fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } err_unlock: spin_unlock_bh(&br->hash_lock); return err; } int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; int err = 0; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; spin_unlock_bh(&br->hash_lock); return err; } void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool offloaded) { struct net_bridge_fdb_entry *fdb; spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags)) change_bit(BR_FDB_OFFLOADED, &fdb->flags); spin_unlock_bh(&br->hash_lock); } void br_fdb_clear_offload(const struct net_device *dev, u16 vid) { struct net_bridge_fdb_entry *f; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_rtnl(dev); if (!p) return; spin_lock_bh(&p->br->hash_lock); hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) { if (f->dst == p && f->key.vlan_id == vid) clear_bit(BR_FDB_OFFLOADED, &f->flags); } spin_unlock_bh(&p->br->hash_lock); } EXPORT_SYMBOL_GPL(br_fdb_clear_offload);
153 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/posix_acl.h (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H #include <linux/bug.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <uapi/linux/posix_acl.h> struct user_namespace; struct posix_acl_entry { short e_tag; unsigned short e_perm; union { kuid_t e_uid; kgid_t e_gid; }; }; struct posix_acl { refcount_t a_refcount; unsigned int a_count; struct rcu_head a_rcu; struct posix_acl_entry a_entries[] __counted_by(a_count); }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++) /* * Duplicate an ACL handle. */ static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) { if (acl) refcount_inc(&acl->a_refcount); return acl; } /* * Free an ACL handle. */ static inline void posix_acl_release(struct posix_acl *acl) { if (acl && refcount_dec_and_test(&acl->a_refcount)) kfree_rcu(acl, a_rcu); } /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) { inode->i_acl = NULL; inode->i_default_acl = NULL; } int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; } #define simple_set_acl NULL static inline int simple_acl_create(struct inode *dir, struct inode *inode) { return 0; } static inline void cache_no_acl(struct inode *inode) { } static inline int posix_acl_create(struct inode *inode, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { *default_acl = *acl = NULL; return 0; } static inline void forget_all_cached_acls(struct inode *inode) { } static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; } static inline int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { return 0; } #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */
2 2 3 3 1 3 51 3 51 49 48 47 8 39 36 28 41 40 29 50 33 1 33 33 32 2 3 28 24 32 3 2 29 30 27 33 1 4 2 1 4 2 4 2 1 2 2 1 1 1 1 1 1 2 3 3 2 2 2 2 2 2 2 3 1 1 1 4 4 1 4 1 3 3 3 26 26 27 11 11 11 11 3 2 2 1 3 6 6 5 5 6 9 8 1 7 8 9 3 2 2 6 6 1 1 2 9 9 2 7 9 2 2 2 2 2 1 1 1 1 1 1 3 3 1 3 3 7 6 2 4 6 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> #include <linux/perf_event.h> #include <uapi/linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/btf_ids.h> #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { int i; for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); cond_resched(); } } static int bpf_array_alloc_percpu(struct bpf_array *array) { void __percpu *ptr; int i; for (i = 0; i < array->map.max_entries; i++) { ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; } array->pptrs[i] = ptr; cond_resched(); } return 0; } /* Called from syscall */ int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; /* avoid overflow on round_up(map->value_size) */ if (attr->value_size > INT_MAX) return -E2BIG; /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */ if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE) return -E2BIG; return 0; } static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(NULL); u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); max_entries = attr->max_entries; /* On 32 bit archs roundup_pow_of_two() with max_entries that has * upper most bit set in u32 space is undefined behavior due to * resulting 1U << 32, so do it manually here in u64 space. */ mask64 = fls_long(max_entries - 1); mask64 = 1ULL << mask64; mask64 -= 1; index_mask = mask64; if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ max_entries = index_mask + 1; /* Check for overflows. */ if (max_entries < attr->max_entries) return ERR_PTR(-E2BIG); } array_size = sizeof(*array); if (percpu) { array_size += (u64) max_entries * sizeof(void *); } else { /* rely on vmalloc() to return page-aligned memory and * ensure array->value is exactly page-aligned */ if (attr->map_flags & BPF_F_MMAPABLE) { array_size = PAGE_ALIGN(array_size); array_size += PAGE_ALIGN((u64) max_entries * elem_size); } else { array_size += (u64) max_entries * elem_size; } } /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); if (!data) return ERR_PTR(-ENOMEM); array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } if (!array) return ERR_PTR(-ENOMEM); array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } return &array->map; } static void *array_map_elem_ptr(struct bpf_array* array, u32 index) { return array->value + (u64)array->elem_size * index; } /* Called from syscall or from eBPF program */ static void *array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + (u64)array->elem_size * (index & array->index_mask); } static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { struct bpf_array *array = container_of(map, struct bpf_array, map); if (map->max_entries != 1) return -ENOTSUPP; if (off >= map->value_size) return -EINVAL; *imm = (unsigned long)array->value; return 0; } static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, u32 *off) { struct bpf_array *array = container_of(map, struct bpf_array, map); u64 base = (unsigned long)array->value; u64 range = array->elem_size; if (map->max_entries != 1) return -ENOTSUPP; if (imm < base || imm >= base + range) return -ENOENT; *off = imm - base; return 0; } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = array->elem_size; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); } else { *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); } *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return this_cpu_ptr(array->pptrs[index & array->index_mask]); } /* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */ static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; if (!bpf_jit_supports_percpu_insn()) return -EOPNOTSUPP; if (map->map_flags & BPF_F_INNER_MAP) return -EOPNOTSUPP; BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs)); *insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5); } *insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); *insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(BPF_REG_0, 0); return insn - insn_buf; } static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; if (cpu >= nr_cpu_ids) return NULL; if (unlikely(index >= array->map.max_entries)) return NULL; return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(index >= array->map.max_entries)) return -ENOENT; /* per_cpu areas are zero-filled and bpf programs can only * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } rcu_read_unlock(); return 0; } /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Called from syscall or from eBPF program */ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; char *val; if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); if (map_flags & BPF_F_LOCK) copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); bpf_obj_free_fields(array->map.record, val); } return 0; } int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu, off = 0; u32 size; if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; /* the user space will provide round_up(value_size, 8) bytes that * will be copied into per-cpu area. bpf programs can only access * value_size of it. During lookup the same extra bytes will be * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); return 0; } /* Called from syscall or from eBPF program */ static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } static void *array_map_vmalloc_addr(struct bpf_array *array) { return (void *)round_down((unsigned long)array, PAGE_SIZE); } static void array_map_free_timers_wq(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); } } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) bpf_array_free_percpu(array); if (array->map.map_flags & BPF_F_MMAPABLE) bpf_map_area_free(array_map_vmalloc_addr(array)); else bpf_map_area_free(array); } static void array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void *value; rcu_read_lock(); value = array_map_lookup_elem(map, key); if (!value) { rcu_read_unlock(); return; } if (map->btf_key_type_id) seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_putc(m, '\n'); rcu_read_unlock(); } static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; int cpu; rcu_read_lock(); seq_printf(m, "%u: {\n", *(u32 *)key); pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { seq_printf(m, "\tcpu%d: ", cpu); btf_type_seq_show(map->btf, map->btf_value_type_id, per_cpu_ptr(pptr, cpu), m); seq_putc(m, '\n'); } seq_puts(m, "}\n"); rcu_read_unlock(); } static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { u32 int_data; /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || map->max_entries != 1) return -EINVAL; if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) return -EINVAL; return 0; } if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; int_data = *(u32 *)(key_type + 1); /* bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. */ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) return -EINVAL; return 0; } static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_array *array = container_of(map, struct bpf_array, map); pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT; if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) return -EINVAL; return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), vma->vm_pgoff + pgoff); } static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { if (!bpf_map_meta_equal(meta0, meta1)) return false; return meta0->map_flags & BPF_F_INNER_MAP ? true : meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; u32 index; }; static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; if (info->index >= map->max_entries) return NULL; if (*pos == 0) ++*pos; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_map *map = info->map; struct bpf_array *array; u32 index; ++*pos; ++info->index; if (info->index >= map->max_entries) return NULL; array = container_of(map, struct bpf_array, map); index = info->index & array->index_mask; if (info->percpu_value_buf) return (void *)(uintptr_t)array->pptrs[index]; return array_map_elem_ptr(array, index); } static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_seq_array_map_info *info = seq->private; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_iter_meta meta; struct bpf_prog *prog; int off = 0, cpu = 0; void __percpu *pptr; u32 size; meta.seq = seq; prog = bpf_iter_get_info(&meta, v == NULL); if (!prog) return 0; ctx.meta = &meta; ctx.map = info->map; if (v) { ctx.key = &info->index; if (!info->percpu_value_buf) { ctx.value = v; } else { pptr = (void __percpu *)(uintptr_t)v; size = array->elem_size; for_each_possible_cpu(cpu) { copy_map_value_long(map, info->percpu_value_buf + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, info->percpu_value_buf + off); off += size; } ctx.value = info->percpu_value_buf; } } return bpf_iter_run_prog(prog, &ctx); } static int bpf_array_map_seq_show(struct seq_file *seq, void *v) { return __bpf_array_map_seq_show(seq, v); } static void bpf_array_map_seq_stop(struct seq_file *seq, void *v) { if (!v) (void)__bpf_array_map_seq_show(seq, NULL); } static int bpf_iter_init_array_map(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; struct bpf_map *map = aux->map; struct bpf_array *array = container_of(map, struct bpf_array, map); void *value_buf; u32 buf_size; if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { buf_size = array->elem_size * num_possible_cpus(); value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); if (!value_buf) return -ENOMEM; seq_info->percpu_value_buf = value_buf; } /* bpf_iter_attach_map() acquires a map uref, and the uref may be * released before or in the middle of iterating map elements, so * acquire an extra map uref for iterator. */ bpf_map_inc_with_uref(map); seq_info->map = map; return 0; } static void bpf_iter_fini_array_map(void *priv_data) { struct bpf_iter_seq_array_map_info *seq_info = priv_data; bpf_map_put_with_uref(seq_info->map); kfree(seq_info->percpu_value_buf); } static const struct seq_operations bpf_array_map_seq_ops = { .start = bpf_array_map_seq_start, .next = bpf_array_map_seq_next, .stop = bpf_array_map_seq_stop, .show = bpf_array_map_seq_show, }; static const struct bpf_iter_seq_info iter_seq_info = { .seq_ops = &bpf_array_map_seq_ops, .init_seq_private = bpf_iter_init_array_map, .fini_seq_private = bpf_iter_fini_array_map, .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; bool is_percpu; u64 ret = 0; void *val; if (flags != 0) return -EINVAL; is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; array = container_of(map, struct bpf_array, map); if (is_percpu) migrate_disable(); for (i = 0; i < map->max_entries; i++) { if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); else val = array_map_elem_ptr(array, i); num_elems++; key = i; ret = callback_fn((u64)(long)map, (u64)(long)&key, (u64)(long)val, (u64)(long)callback_ctx, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) break; } if (is_percpu) migrate_enable(); return num_elems; } static u64 array_map_mem_usage(const struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; u32 elem_size = array->elem_size; u64 entries = map->max_entries; u64 usage = sizeof(*array); if (percpu) { usage += entries * sizeof(void *); usage += entries * elem_size * num_possible_cpus(); } else { if (map->map_flags & BPF_F_MMAPABLE) { usage = PAGE_ALIGN(usage); usage += PAGE_ALIGN(entries * elem_size); } else { usage += entries * elem_size; } } return usage; } BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array) const struct bpf_map_ops array_map_ops = { .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_release_uref = array_map_free_timers_wq, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, .map_direct_value_addr = array_map_direct_value_addr, .map_direct_value_meta = array_map_direct_value_meta, .map_mmap = array_map_mmap, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; const struct bpf_map_ops percpu_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = percpu_array_map_lookup_elem, .map_gen_lookup = percpu_array_map_gen_lookup, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem, .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_array_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; /* Program read-only/write-only not supported for special maps yet. */ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) return -EINVAL; return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; /* make sure it's empty */ for (i = 0; i < array->map.max_entries; i++) BUG_ON(array->ptrs[i] != NULL); bpf_map_area_free(array); } static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) { void **elem, *ptr; int ret = 0; if (!map->ops->map_fd_sys_lookup_elem) return -ENOTSUPP; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem && (ptr = READ_ONCE(*elem))) *value = map->ops->map_fd_sys_lookup_elem(ptr); else ret = -ENOENT; rcu_read_unlock(); return ret; } /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; u32 index = *(u32 *)key, ufd; if (map_flags != BPF_ANY) return -EINVAL; if (index >= array->map.max_entries) return -E2BIG; ufd = *(u32 *)value; new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, new_ptr); map->ops->map_poke_run(map, index, old_ptr, new_ptr); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, new_ptr); } if (old_ptr) map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; u32 index = *(u32 *)key; if (index >= array->map.max_entries) return -E2BIG; if (map->ops->map_poke_run) { mutex_lock(&array->aux->poke_mutex); old_ptr = xchg(array->ptrs + index, NULL); map->ops->map_poke_run(map, index, old_ptr, NULL); mutex_unlock(&array->aux->poke_mutex); } else { old_ptr = xchg(array->ptrs + index, NULL); } if (old_ptr) { map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { return __fd_array_map_delete_elem(map, key, true); } static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_prog *prog = bpf_prog_get(fd); bool is_extended; if (IS_ERR(prog)) return prog; if (prog->type == BPF_PROG_TYPE_EXT || !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } mutex_lock(&prog->aux->ext_mutex); is_extended = prog->aux->is_extended; if (!is_extended) prog->aux->prog_array_member_cnt++; mutex_unlock(&prog->aux->ext_mutex); if (is_extended) { /* Extended prog can not be tail callee. It's to prevent a * potential infinite loop like: * tail callee prog entry -> tail callee prog subprog -> * freplace prog entry --tailcall-> tail callee prog entry. */ bpf_prog_put(prog); return ERR_PTR(-EBUSY); } return prog; } static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_prog *prog = ptr; mutex_lock(&prog->aux->ext_mutex); prog->aux->prog_array_member_cnt--; mutex_unlock(&prog->aux->ext_mutex); /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(prog); } static u32 prog_fd_array_sys_lookup_elem(void *ptr) { return ((struct bpf_prog *)ptr)->aux->id; } /* decrement refcnt of all bpf_progs that are stored in this map */ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { void **elem, *ptr; u32 prog_id; rcu_read_lock(); elem = array_map_lookup_elem(map, key); if (elem) { ptr = READ_ONCE(*elem); if (ptr) { seq_printf(m, "%u: ", *(u32 *)key); prog_id = prog_fd_array_sys_lookup_elem(ptr); btf_type_seq_show(map->btf, map->btf_value_type_id, &prog_id, m); seq_putc(m, '\n'); } } rcu_read_unlock(); } struct prog_poke_elem { struct list_head list; struct bpf_prog_aux *aux; }; static int prog_array_map_poke_track(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; int ret = 0; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry(elem, &aux->poke_progs, list) { if (elem->aux == prog_aux) goto out; } elem = kmalloc(sizeof(*elem), GFP_KERNEL); if (!elem) { ret = -ENOMEM; goto out; } INIT_LIST_HEAD(&elem->list); /* We must track the program's aux info at this point in time * since the program pointer itself may not be stable yet, see * also comment in prog_array_map_poke_run(). */ elem->aux = prog_aux; list_add_tail(&elem->list, &aux->poke_progs); out: mutex_unlock(&aux->poke_mutex); return ret; } static void prog_array_map_poke_untrack(struct bpf_map *map, struct bpf_prog_aux *prog_aux) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; mutex_lock(&aux->poke_mutex); list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { if (elem->aux == prog_aux) { list_del_init(&elem->list); kfree(elem); break; } } mutex_unlock(&aux->poke_mutex); } void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old) { WARN_ON_ONCE(1); } static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { struct prog_poke_elem *elem; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex)); list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; /* Few things to be aware of: * * 1) We can only ever access aux in this context, but * not aux->prog since it might not be stable yet and * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms * entry. We skip these as poke->tailcall_target_stable * is not active yet. The JIT will do the final fixup * before setting it stable. The various * poke->tailcall_target_stable are successively * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; if (poke->tail_call.map != map || poke->tail_call.key != key) continue; bpf_arch_poke_desc_update(poke, new, old); } } } static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; bpf_fd_array_map_clear(map, true); bpf_map_put(map); } static void prog_array_map_clear(struct bpf_map *map) { struct bpf_array_aux *aux = container_of(map, struct bpf_array, map)->aux; bpf_map_inc(map); schedule_work(&aux->work); } static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) { struct bpf_array_aux *aux; struct bpf_map *map; aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); map = array_map_alloc(attr); if (IS_ERR(map)) { kfree(aux); return map; } container_of(map, struct bpf_array, map)->aux = aux; aux->map = map; return map; } static void prog_array_map_free(struct bpf_map *map) { struct prog_poke_elem *elem, *tmp; struct bpf_array_aux *aux; aux = container_of(map, struct bpf_array, map)->aux; list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) { list_del_init(&elem->list); kfree(elem); } kfree(aux); fd_array_map_free(map); } /* prog_array->aux->{type,jited} is a runtime binding. * Doing static check alone in the verifier is not enough. * Thus, prog_array_map cannot be used as an inner_map * and map_meta_equal is not implemented. */ const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = prog_array_map_alloc, .map_free = prog_array_map_free, .map_poke_track = prog_array_map_poke_track, .map_poke_untrack = prog_array_map_poke_untrack, .map_poke_run = prog_array_map_poke_run, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = prog_array_map_clear, .map_seq_show_elem = prog_array_map_seq_show_elem, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { struct bpf_event_entry *ee; ee = kzalloc(sizeof(*ee), GFP_KERNEL); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; ee->map_file = map_file; } return ee; } static void __bpf_event_entry_free(struct rcu_head *rcu) { struct bpf_event_entry *ee; ee = container_of(rcu, struct bpf_event_entry, rcu); fput(ee->perf_file); kfree(ee); } static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) { call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); if (ee) return ee; ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; } static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } static void perf_event_fd_array_release(struct bpf_map *map, struct file *map_file) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_event_entry *ee; int i; if (map->map_flags & BPF_F_PRESERVE_ELEMS) return; rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, .map_release = perf_event_fd_array_release, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int fd) { return cgroup_get_from_fd(fd); } static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); } static void cgroup_fd_array_free(struct bpf_map *map) { bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } const struct bpf_map_ops cgroup_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], }; #endif static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) { struct bpf_map *map, *inner_map_meta; inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); if (IS_ERR(inner_map_meta)) return inner_map_meta; map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; } map->inner_map_meta = inner_map_meta; return map; } static void array_of_map_free(struct bpf_map *map) { /* map->inner_map_meta is only accessed by syscall which * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_map **inner_map = array_map_lookup_elem(map, key); if (!inner_map) return NULL; return READ_ONCE(*inner_map); } static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = array->elem_size; struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); *insn++ = BPF_MOV64_IMM(ret, 0); return insn - insn_buf; } const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_of_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, .map_gen_lookup = array_of_map_gen_lookup, .map_lookup_batch = generic_map_lookup_batch, .map_update_batch = generic_map_update_batch, .map_check_btf = map_check_no_btf, .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], };
89 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _TRACE_SYSCALL_H #define _TRACE_SYSCALL_H #include <linux/tracepoint.h> #include <linux/unistd.h> #include <linux/trace_events.h> #include <linux/thread_info.h> #include <asm/ptrace.h> /* * A syscall entry in the ftrace syscalls array. * * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) * @enter_fields: list of fields for syscall_enter trace event * @enter_event: associated syscall_enter trace event * @exit_event: associated syscall_exit trace event */ struct syscall_metadata { const char *name; int syscall_nr; int nb_args; const char **types; const char **args; struct list_head enter_fields; struct trace_event_call *enter_event; struct trace_event_call *exit_event; }; #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) static inline void syscall_tracepoint_update(struct task_struct *p) { if (test_syscall_work(SYSCALL_TRACEPOINT)) set_task_syscall_work(p, SYSCALL_TRACEPOINT); else clear_task_syscall_work(p, SYSCALL_TRACEPOINT); } #else static inline void syscall_tracepoint_update(struct task_struct *p) { } #endif #endif /* _TRACE_SYSCALL_H */
62 62 62 62 39 39 38 38 46 46 46 44 44 46 26 45 22 62 62 62 62 62 62 62 62 62 62 62 25 1 9 2 433 430 433 433 433 433 434 431 433 433 434 432 434 434 434 198 402 403 434 23 23 1 1 1 3 1 41 1 1 41 1 1 40 40 1 39 40 2 40 39 2 2 40 40 1 40 3 39 39 38 39 1 1 38 1 37 1 36 1 1 1 35 36 36 4 3 33 5 4 28 41 11 20 21 12 4 44 28 1 43 1 42 1 41 21 21 28 23 23 23 22 23 22 23 23 22 23 23 23 23 1 23 1 23 22 23 19 1 1 1 1 1 1 1 1 1 22 25 25 25 25 25 25 25 25 2 2 2 25 25 24 25 1 25 1 25 25 2 25 1 25 1 25 1 25 1 25 25 1 25 25 2 2 2 2 2 2 2 2 2 3 3 2 1 1 1 1 1 1 1 1 1 3 23 23 23 23 100 100 3 98 38 38 36 191 191 100 100 38 38 191 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/fib_rules.c Generic Routing Rules * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> #include <linux/indirect_call_wrapper.h> #if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES) #ifdef CONFIG_IP_MULTIPLE_TABLES #define INDIRECT_CALL_MT(f, f2, f1, ...) \ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #elif defined(CONFIG_IP_MULTIPLE_TABLES) #define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) #else #define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__) #endif static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(0), KUIDT_INIT(~0), }; bool fib_rule_matchall(const struct fib_rule *rule) { if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; if (fib_rule_port_range_set(&rule->sport_range)) return false; if (fib_rule_port_range_set(&rule->dport_range)) return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table) { struct fib_rule *r; r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; /* The lock is not required here, the list in unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; } EXPORT_SYMBOL(fib_default_rule_add); static u32 fib_default_rule_pref(struct fib_rules_ops *ops) { struct list_head *pos; struct fib_rule *rule; if (!list_empty(&ops->rules_list)) { pos = ops->rules_list.next; if (pos->next != &ops->rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; } } return 0; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); static struct fib_rules_ops *lookup_rules_ops(const struct net *net, int family) { struct fib_rules_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (ops->family == family) { if (!try_module_get(ops->owner)) ops = NULL; rcu_read_unlock(); return ops; } } rcu_read_unlock(); return NULL; } static void rules_ops_put(struct fib_rules_ops *ops) { if (ops) module_put(ops->owner); } static void flush_route_cache(struct fib_rules_ops *ops) { if (ops->flush_cache) ops->flush_cache(ops); } static int __fib_rules_register(struct fib_rules_ops *ops) { int err = -EEXIST; struct fib_rules_ops *o; struct net *net; net = ops->fro_net; if (ops->rule_size < sizeof(struct fib_rule)) return -EINVAL; if (ops->match == NULL || ops->configure == NULL || ops->compare == NULL || ops->fill == NULL || ops->action == NULL) return -EINVAL; spin_lock(&net->rules_mod_lock); list_for_each_entry(o, &net->rules_ops, list) if (ops->family == o->family) goto errout; list_add_tail_rcu(&ops->list, &net->rules_ops); err = 0; errout: spin_unlock(&net->rules_mod_lock); return err; } struct fib_rules_ops * fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) { struct fib_rules_ops *ops; int err; ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); if (ops == NULL) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&ops->rules_list); ops->fro_net = net; err = __fib_rules_register(ops); if (err) { kfree(ops); ops = ERR_PTR(err); } return ops; } EXPORT_SYMBOL_GPL(fib_rules_register); static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); if (ops->delete) ops->delete(rule); fib_rule_put(rule); } } void fib_rules_unregister(struct fib_rules_ops *ops) { struct net *net = ops->fro_net; spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); spin_unlock(&net->rules_mod_lock); fib_rules_cleanup_ops(ops); kfree_rcu(ops, rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); static int uid_range_set(struct fib_kuid_range *range) { return uid_valid(range->start) && uid_valid(range->end); } static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb) { struct fib_rule_uid_range *in; struct fib_kuid_range out; in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]); out.start = make_kuid(current_user_ns(), in->start); out.end = make_kuid(current_user_ns(), in->end); return out; } static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) { struct fib_rule_uid_range out = { from_kuid_munged(current_user_ns(), range->start), from_kuid_munged(current_user_ns(), range->end) }; return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } static int nla_get_port_range(struct nlattr *pattr, struct fib_rule_port_range *port_range) { const struct fib_rule_port_range *pr = nla_data(pattr); if (!fib_rule_port_range_valid(pr)) return -EINVAL; port_range->start = pr->start; port_range->end = pr->end; return 0; } static int nla_put_port_range(struct sk_buff *skb, int attrtype, struct fib_rule_port_range *range) { return nla_put(skb, attrtype, sizeof(*range), range); } static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { int ret = 0; if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) goto out; if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) goto out; if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; if (uid_lt(fl->flowi_uid, rule->uid_range.start) || uid_gt(fl->flowi_uid, rule->uid_range.end)) goto out; ret = INDIRECT_CALL_MT(ops->match, fib6_rule_match, fib4_rule_match, rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; } int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { struct fib_rule *rule; int err; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { jumped: if (!fib_rule_match(rule, ops, fl, flags, arg)) continue; if (rule->action == FR_ACT_GOTO) { struct fib_rule *target; target = rcu_dereference(rule->ctarget); if (target == NULL) { continue; } else { rule = target; goto jumped; } } else if (rule->action == FR_ACT_NOP) continue; else err = INDIRECT_CALL_MT(ops->action, fib6_rule_action, fib4_rule_action, rule, fl, flags, arg); if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, rule, flags, arg)) continue; if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } break; } } err = -ESRCH; out: rcu_read_unlock(); return err; } EXPORT_SYMBOL_GPL(fib_rules_lookup); static int call_fib_rule_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_rule *rule, int family, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = family, .info.extack = extack, .rule = rule, }; return call_fib_notifier(nb, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, enum fib_event_type event_type, struct fib_rule *rule, struct fib_rules_ops *ops, struct netlink_ext_ack *extack) { struct fib_rule_notifier_info info = { .info.family = ops->family, .info.extack = extack, .rule = rule, }; ASSERT_RTNL(); /* Paired with READ_ONCE() in fib_rules_seq() */ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack) { struct fib_rules_ops *ops; struct fib_rule *rule; int err = 0; ops = lookup_rules_ops(net, family); if (!ops) return -EAFNOSUPPORT; list_for_each_entry_rcu(rule, &ops->rules_list, list) { err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD, rule, family, extack); if (err) break; } rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_rules_dump); unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; ops = lookup_rules_ops(net, family); if (!ops) return 0; /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; } EXPORT_SYMBOL_GPL(fib_rules_seq_read); static struct fib_rule *rule_find(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule, bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (rule->action && r->action != rule->action) continue; if (rule->table && r->table != rule->table) continue; if (user_priority && r->pref != rule->pref) continue; if (rule->iifname[0] && memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (rule->oifname[0] && memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (rule->mark && r->mark != rule->mark) continue; if (rule->suppress_ifgroup != -1 && r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (rule->suppress_prefixlen != -1 && r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; if (uid_range_set(&rule->uid_range) && (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; if (rule->proto && r->proto != rule->proto) continue; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return r; } return NULL; } #ifdef CONFIG_NET_L3_MASTER_DEV static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { nlrule->l3mdev = nla_get_u8(nla); if (nlrule->l3mdev != 1) { NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); return -1; } return 0; } #else static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); return -1; } #endif static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; if (frh->src_len) if (!tb[FRA_SRC] || frh->src_len > (ops->addr_size * 8) || nla_len(tb[FRA_SRC]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid source address"); goto errout; } if (frh->dst_len) if (!tb[FRA_DST] || frh->dst_len > (ops->addr_size * 8) || nla_len(tb[FRA_DST]) != ops->addr_size) { NL_SET_ERR_MSG(extack, "Invalid dst address"); goto errout; } nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; } refcount_set(&nlrule->refcnt, 1); nlrule->fr_net = net; if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; } else { nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { struct net_device *dev; nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); if (tb[FRA_L3MDEV] && fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) goto errout_free; nlrule->action = frh->action; nlrule->flags = frh->flags; nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { if (nlrule->action != FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; } nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ if (nlrule->target <= nlrule->pref) { NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; } if (nlrule->l3mdev && nlrule->table) { NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } nlrule->uid_range = nla_get_kuid_range(tb); if (!uid_range_set(&nlrule->uid_range) || !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; } } else { nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], &nlrule->sport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], &nlrule->dport_range); if (err) { NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } } *rule = nlrule; return 0; errout_free: kfree(nlrule); errout: return err; } static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { if (r->action != rule->action) continue; if (r->table != rule->table) continue; if (r->pref != rule->pref) continue; if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; if (r->mark != rule->mark) continue; if (r->suppress_ifgroup != rule->suppress_ifgroup) continue; if (r->suppress_prefixlen != rule->suppress_prefixlen) continue; if (r->mark_mask != rule->mark_mask) continue; if (r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; if (r->l3mdev != rule->l3mdev) continue; if (!uid_eq(r->uid_range.start, rule->uid_range.start) || !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; if (r->ip_proto != rule->ip_proto) continue; if (r->proto != rule->proto) continue; if (!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; return 1; } return 0; } static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [FRA_PRIORITY] = { .type = NLA_U32 }, [FRA_FWMARK] = { .type = NLA_U32 }, [FRA_FLOW] = { .type = NLA_U32 }, [FRA_TUN_ID] = { .type = NLA_U64 }, [FRA_FWMASK] = { .type = NLA_U32 }, [FRA_TABLE] = { .type = NLA_U32 }, [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, [FRA_GOTO] = { .type = NLA_U32 }, [FRA_L3MDEV] = { .type = NLA_U8 }, [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (!ops) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); if (err < 0) goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref == rule->target) { RCU_INIT_POINTER(rule->ctarget, r); break; } } if (rcu_dereference_protected(rule->ctarget, 1) == NULL) unresolved = 1; list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; } if (last) list_add_rcu(&rule->list, &last->list); else list_add_rcu(&rule->list, &ops->rules_list); if (ops->unresolved_rules) { /* * There are unresolved goto rules in the list, check if * any of them are pointing to this new rule. */ list_for_each_entry(r, &ops->rules_list, list) { if (r->action == FR_ACT_GOTO && r->target == rule->pref && rtnl_dereference(r->ctarget) == NULL) { rcu_assign_pointer(r->ctarget, rule); if (--ops->unresolved_rules == 0) break; } } } if (rule->action == FR_ACT_GOTO) ops->nr_goto_rules++; if (unresolved) ops->unresolved_rules++; if (rule->tun_id) ip_tunnel_need_metadata(); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: kfree(rule); errout: rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_newrule); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; int err = -EINVAL; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX, fib_rule_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; } err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; goto errout; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; goto errout; } if (ops->delete) { err = ops->delete(rule); if (err) goto errout; } if (rule->tun_id) ip_tunnel_unneed_metadata(); list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { ops->nr_goto_rules--; if (rtnl_dereference(rule->ctarget) == NULL) ops->unresolved_rules--; } /* * Check if this rule is a target to any of them. If so, * adjust to the next one with the same preference or * disable them. As this operation is eventually very * expensive, it is only performed if goto rules, except * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { struct fib_rule *n; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) n = NULL; list_for_each_entry(r, &ops->rules_list, list) { if (rtnl_dereference(r->ctarget) != rule) continue; rcu_assign_pointer(r->ctarget, n); if (!n) ops->unresolved_rules++; } } call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; errout: kfree(nlrule); rules_ops_put(ops); return err; } EXPORT_SYMBOL_GPL(fib_nl_delrule); static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) { size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + nla_total_size(4) /* FRA_PRIORITY */ + nla_total_size(4) /* FRA_TABLE */ + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); return payload; } static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, u32 pid, u32 seq, int type, int flags, struct fib_rules_ops *ops) { struct nlmsghdr *nlh; struct fib_rule_hdr *frh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); if (nlh == NULL) return -EMSGSIZE; frh = nlmsg_data(nlh); frh->family = ops->family; frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; if (rule->iifindex == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; if (rule->oifindex == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } if ((rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref)) || (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark)) || ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD)) || (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup)) goto nla_put_failure; } if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, struct fib_rules_ops *ops) { int idx = 0; struct fib_rule *rule; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(rule, &ops->rules_list, list) { if (idx < cb->args[1]) goto skip; err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWRULE, NLM_F_MULTI, ops); if (err) break; skip: idx++; } rcu_read_unlock(); cb->args[1] = idx; rules_ops_put(ops); return err; } static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib_rule_hdr *frh; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for fib rule dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); return -EINVAL; } return 0; } static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int err, idx = 0, family; if (cb->strict_check) { err = fib_valid_dumprule_req(nlh, cb->extack); if (err < 0) return err; } family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); if (ops == NULL) return -EAFNOSUPPORT; return dump_rules(skb, cb, ops); } err = 0; rcu_read_lock(); list_for_each_entry_rcu(ops, &net->rules_ops, list) { if (idx < cb->args[0] || !try_module_get(ops->owner)) goto skip; err = dump_rules(skb, cb, ops); if (err < 0) break; cb->args[1] = 0; skip: idx++; } rcu_read_unlock(); cb->args[0] = idx; return err; } static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid) { struct net *net; struct sk_buff *skb; int err = -ENOMEM; net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); if (skb == NULL) goto errout; err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); if (err < 0) { /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) rule->iifindex = dev->ifindex; if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) rule->oifindex = dev->ifindex; } } static void detach_rules(struct list_head *rules, struct net_device *dev) { struct fib_rule *rule; list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) rule->iifindex = -1; if (rule->oifindex == dev->ifindex) rule->oifindex = -1; } } static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; ASSERT_RTNL(); switch (event) { case NETDEV_REGISTER: list_for_each_entry(ops, &net->rules_ops, list) attach_rules(&ops->rules_list, dev); break; case NETDEV_CHANGENAME: list_for_each_entry(ops, &net->rules_ops, list) { detach_rules(&ops->rules_list, dev); attach_rules(&ops->rules_list, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry(ops, &net->rules_ops, list) detach_rules(&ops->rules_list, dev); break; } return NOTIFY_DONE; } static struct notifier_block fib_rules_notifier = { .notifier_call = fib_rules_event, }; static int __net_init fib_rules_net_init(struct net *net) { INIT_LIST_HEAD(&net->rules_ops); spin_lock_init(&net->rules_mod_lock); return 0; } static void __net_exit fib_rules_net_exit(struct net *net) { WARN_ON_ONCE(!list_empty(&net->rules_ops)); } static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, }; static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule}, {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule}, {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init fib_rules_init(void) { int err; rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) goto fail; err = register_netdevice_notifier(&fib_rules_notifier); if (err < 0) goto fail_unregister; return 0; fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } subsys_initcall(fib_rules_init);
155 59 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMENS_H #define _LINUX_TIMENS_H #include <linux/sched.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/err.h> #include <linux/time64.h> struct user_namespace; extern struct user_namespace init_user_ns; struct vm_area_struct; struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; }; struct time_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; struct page *vvar_page; /* If set prevents changing offsets after any task joined namespace. */ bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { refcount_inc(&ns->ns.count); return ns; } struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { if (refcount_dec_and_test(&ns->ns.count)) free_time_ns(ns); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); struct proc_timens_offset { int clockid; struct timespec64 val; }; int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int n); static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->monotonic); } static inline void timens_add_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->boottime); } static inline u64 timens_add_boottime_ns(u64 nsec) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; return nsec + timespec64_to_ns(&ns_offsets->boottime); } static inline void timens_sub_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets; *ts = timespec64_sub(*ts, ns_offsets->boottime); } ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *offsets); static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { struct time_namespace *ns = current->nsproxy->time_ns; if (likely(ns == &init_time_ns)) return tim; return do_timens_ktime_to_host(clockid, tim, &ns->offsets); } #else static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { return 0; } static inline void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { } static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; } static inline void put_time_ns(struct time_namespace *ns) { } static inline struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (flags & CLONE_NEWTIME) return ERR_PTR(-EINVAL); return old_ns; } static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { return; } static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { return NULL; } static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } static inline u64 timens_add_boottime_ns(u64 nsec) { return nsec; } static inline void timens_sub_boottime(struct timespec64 *ts) { } static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { return tim; } #endif struct vdso_data *arch_get_vdso_data(void *vvar_page); #endif /* _LINUX_TIMENS_H */
24 3 3 3 25 25 25 24 25 25 25 25 25 25 25 155 126 30 153 68 70 70 11 4 4 4 4 4 6 6 6 6 6 1 5 6 11 3 2 2 2 1 2 2 3 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 // SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * * Authors: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> #include <uapi/linux/wait.h> #include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. * @level: pid namespace level */ static struct kmem_cache *create_pid_cachep(unsigned int level) { /* Level 0 is init_pid_ns.pid_cachep */ struct kmem_cache **pkc = &pid_cache[level - 1]; struct kmem_cache *kc; char name[4 + 10 + 1]; unsigned int len; kc = READ_ONCE(*pkc); if (kc) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); len = struct_size_t(struct pid, numbers, level + 1); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); } static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); } static void dec_pid_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); } static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; int err; err = -EINVAL; if (!in_userns(parent_pid_ns->user_ns, user_ns)) goto out; err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; ucounts = inc_pid_namespaces(user_ns); if (!ucounts) goto out; err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out_dec; idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) goto out_free_idr; ns->ns.ops = &pidns_operations; refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif return ns; out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } static void delayed_free_pidns(struct rcu_head *p) { struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu); dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } static void destroy_pid_namespace(struct pid_namespace *ns) { ns_free_inum(&ns->ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); } void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; if (!refcount_dec_and_test(&ns->ns.count)) break; destroy_pid_namespace(ns); ns = parent; } } EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); /* * Ignore SIGCHLD causing any terminated children to autoreap. * This speeds up the namespace shutdown, plus see the comment * below. */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. * Find remaining pid_ts in the namespace, signal and wait for them * to exit. * * Note: This signals each threads in the namespace - even those that * belong to the same thread group, To avoid this, we would have * to walk the entire tasklist looking a processes in this * namespace, but that could be unnecessarily expensive if the * pid namespace has just a few processes. Or we need to * maintain a tasklist for each pid namespace. * */ rcu_read_lock(); read_lock(&tasklist_lock); nr = 2; idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); } read_unlock(&tasklist_lock); rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE * process whose parents processes are outside of the pid * namespace. Such processes are created with setns()+fork(). * * If those EXIT_ZOMBIE processes are not reaped by their * parents before their parents exit, they will be reparented * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * * Semantically it is also desirable to wait for EXIT_ZOMBIE * processes before allowing the child_reaper to be reaped, as * that gives the invariant that when the init process of a * pid namespace is reaped all of the processes in the pid * namespace are gone. * * Once all of the other tasks are gone from the pid_namespace * free_pid() will awaken this task. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; schedule(); } __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; acct_exit_ns(pid_ns); return; } #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; int ret, next; if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); return ret; } extern int pid_max; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &pid_max, }, }; #endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { if (pid_ns == &init_pid_ns) return 0; switch (cmd) { case LINUX_REBOOT_CMD_RESTART2: case LINUX_REBOOT_CMD_RESTART: pid_ns->reboot = SIGHUP; break; case LINUX_REBOOT_CMD_POWER_OFF: case LINUX_REBOOT_CMD_HALT: pid_ns->reboot = SIGINT; break; default: return -EINVAL; } read_lock(&tasklist_lock); send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); /* Not reached */ return 0; } static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) { return container_of(ns, struct pid_namespace, ns); } static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(task); if (ns) get_pid_ns(ns); rcu_read_unlock(); return ns ? &ns->ns : NULL; } static struct ns_common *pidns_for_children_get(struct task_struct *task) { struct pid_namespace *ns = NULL; task_lock(task); if (task->nsproxy) { ns = task->nsproxy->pid_ns_for_children; get_pid_ns(ns); } task_unlock(task); if (ns) { read_lock(&tasklist_lock); if (!ns->child_reaper) { put_pid_ns(ns); ns = NULL; } read_unlock(&tasklist_lock); } return ns ? &ns->ns : NULL; } static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); } static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* * Only allow entering the current active pid namespace * or a child of the current active pid namespace. * * This is required for fork to return a usable pid value and * this maintains the property that processes and their * children can not escape their current pid namespace. */ if (new->level < active->level) return -EINVAL; ancestor = new; while (ancestor->level > active->level) ancestor = ancestor->parent; if (ancestor != active) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); nsproxy->pid_ns_for_children = get_pid_ns(new); return 0; } static struct ns_common *pidns_get_parent(struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *pid_ns, *p; /* See if the parent is in the current namespace */ pid_ns = p = to_pid_ns(ns)->parent; for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == active) break; p = p->parent; } return &get_pid_ns(pid_ns)->ns; } static struct user_namespace *pidns_owner(struct ns_common *ns) { return to_pid_ns(ns)->user_ns; } const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_init("kernel", pid_ns_ctl_table); #endif register_pid_ns_sysctl_table_vm(); return 0; } __initcall(pid_namespaces_init);
1922 2 17 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/net_namespace.h> #include <linux/if_arp.h> #include <net/rtnetlink.h> static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev) { dev_lstats_add(dev, skb->len); dev_kfree_skb(skb); return NETDEV_TX_OK; } struct nlmon { struct netlink_tap nt; }; static int nlmon_open(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); nlmon->nt.dev = dev; nlmon->nt.module = THIS_MODULE; return netlink_add_tap(&nlmon->nt); } static int nlmon_close(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); return netlink_remove_tap(&nlmon->nt); } static void nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes); } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops nlmon_ethtool_ops = { .get_link = always_on, }; static const struct net_device_ops nlmon_ops = { .ndo_open = nlmon_open, .ndo_stop = nlmon_close, .ndo_start_xmit = nlmon_xmit, .ndo_get_stats64 = nlmon_get_stats64, }; static void nlmon_setup(struct net_device *dev) { dev->type = ARPHRD_NETLINK; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->netdev_ops = &nlmon_ops; dev->ethtool_ops = &nlmon_ethtool_ops; dev->needs_free_netdev = true; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->flags = IFF_NOARP; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; /* That's rather a softlimit here, which, of course, * can be altered. Not a real MTU, but what is to be * expected in most cases. */ dev->mtu = NLMSG_GOODSIZE; dev->min_mtu = sizeof(struct nlmsghdr); } static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) return -EINVAL; return 0; } static struct rtnl_link_ops nlmon_link_ops __read_mostly = { .kind = "nlmon", .priv_size = sizeof(struct nlmon), .setup = nlmon_setup, .validate = nlmon_validate, }; static __init int nlmon_register(void) { return rtnl_link_register(&nlmon_link_ops); } static __exit void nlmon_unregister(void) { rtnl_link_unregister(&nlmon_link_ops); } module_init(nlmon_register); module_exit(nlmon_unregister); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>"); MODULE_DESCRIPTION("Netlink monitoring device"); MODULE_ALIAS_RTNL_LINK("nlmon");
15 15 71 71 70 71 3 4 4 3 3 3 4 195 193 196 196 195 196 196 40 40 40 40 40 99 100 100 15 15 15 11 4 15 10 5 15 3 3 10 10 10 8 8 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); trace_tcp_cong_state_set(sk, ca_state); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } /* Must be called with rcu lock held */ static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif return ca; } /* Simple linear search, not much in here. */ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (e->key == key) return e; } return NULL; } int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } return 0; } /* Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { pr_notice("%s already registered or non-unique key\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module gets removed entirely. * * A try_module_get() should fail by now as our module is * in "going" state since no refs are held anymore and * module_exit() handler being called. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Replace a registered old ca with a new one. * * The new ca must have the same name as the old one, that has been * registered. */ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; int ret = 0; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); existing = tcp_ca_find_key(old_ca->key); if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { pr_notice("%s not registered or non-unique key\n", ca->name); ret = -EINVAL; } else if (existing != old_ca) { pr_notice("invalid old congestion control algorithm to replace\n"); ret = -EINVAL; } else { /* Add the new one before removing the old one to keep * one implementation available all the time. */ list_add_tail_rcu(&ca->list, &tcp_cong_list); list_del_rcu(&existing->list); pr_debug("%s updated\n", ca->name); } spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module or struct_ops gets removed entirely. */ if (!ret) synchronize_rcu(); return ret; } u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = tcp_ca_find_autoload(name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; } rcu_read_unlock(); return key; } char *tcp_ca_get_name_by_key(u32 key, char *buffer) { const struct tcp_congestion_ops *ca; char *ret = NULL; rcu_read_lock(); ca = tcp_ca_find_key(key); if (ca) { strscpy(buffer, ca->name, TCP_CA_NAME_MAX); ret = buffer; } rcu_read_unlock(); return ret; } /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); icsk->icsk_ca_initialized = 0; bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *prev; int ret; rcu_read_lock(); ca = tcp_ca_find_autoload(name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else if (!net_eq(net, &init_net) && !(ca->flags & TCP_CONG_NON_RESTRICTED)) { /* Only init netns can set default to a restricted algorithm */ ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } rcu_read_unlock(); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(&init_net, CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(struct net *net, char *name) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); strscpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; int err = 0; if (icsk->icsk_ca_dst_locked) return -EPERM; rcu_read_lock(); if (!load) ca = tcp_ca_find(name); else ca = tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; else tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but * something better;) a packet is only considered (s)acked in its entirety to * defend the ACK attacks described in the RFC. Slow start processes a stretch * ACK of degree N as if N acks of degree 1 are received back to back except * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ __bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); acked -= cwnd - tcp_snd_cwnd(tp); tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ __bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ __bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } /* In dangerous area, increase slowly. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, };
3488 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tree variant. * * Copyright (C) IBM Corporation, 2017 * * Author: Paul McKenney <paulmck@linux.ibm.com> */ #ifndef _LINUX_SRCU_TREE_H #define _LINUX_SRCU_TREE_H #include <linux/rcu_node_tree.h> #include <linux/completion.h> struct srcu_node; struct srcu_struct; /* * Per-CPU structure feeding into leaf srcu_node, similar in function * to rcu_node. */ struct srcu_data { /* Read-side state. */ atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ bool srcu_cblist_invoking; /* Invoking these CBs? */ struct timer_list delay_work; /* Delay for CB invoking */ struct work_struct work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ struct srcu_node *mynode; /* Leaf srcu_node. */ unsigned long grpmask; /* Mask for leaf srcu_node */ /* ->srcu_data_have_cbs[]. */ int cpu; struct srcu_struct *ssp; }; /* Values for ->srcu_reader_flavor. */ #define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). #define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). #define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). /* * Node in SRCU combining tree, similar in function to rcu_data. */ struct srcu_node { spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children having CBs, but only */ /* if greater than ->srcu_gp_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs have CBs for given GP? */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. */ }; /* * Per-SRCU-domain structure, update-side data linked from srcu_struct. */ struct srcu_usage { struct srcu_node *node; /* Combining tree. */ struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ int srcu_size_state; /* Small-to-big transition state. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ spinlock_t __private lock; /* Protect counters and size state. */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ unsigned long srcu_gp_start; /* Last GP start timestamp (jiffies) */ unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ unsigned long srcu_size_jiffies; /* Current contention-measurement interval. */ unsigned long srcu_n_lock_retries; /* Contention events in current interval. */ unsigned long srcu_n_exp_nodelay; /* # expedited no-delays in current GP phase. */ bool sda_is_static; /* May ->sda be passed to free_percpu()? */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ struct completion srcu_barrier_completion; /* Awaken barrier rq at end. */ atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ /* callback for the barrier */ /* operation. */ unsigned long reschedule_jiffies; unsigned long reschedule_count; struct delayed_work work; struct srcu_struct *srcu_ssp; }; /* * Per-SRCU-domain structure, similar in function to rcu_state. */ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ }; // Values for size state variable (->srcu_size_state). Once the state // has been set to SRCU_SIZE_ALLOC, the grace-period code advances through // this state machine one step per grace period until the SRCU_SIZE_BIG state // is reached. Otherwise, the state machine remains in the SRCU_SIZE_SMALL // state indefinitely. #define SRCU_SIZE_SMALL 0 // No srcu_node combining tree, ->node == NULL #define SRCU_SIZE_ALLOC 1 // An srcu_node tree is being allocated, initialized, // and then referenced by ->node. It will not be used. #define SRCU_SIZE_WAIT_BARRIER 2 // The srcu_node tree starts being used by everything // except call_srcu(), especially by srcu_barrier(). // By the end of this state, all CPUs and threads // are aware of this tree's existence. #define SRCU_SIZE_WAIT_CALL 3 // The srcu_node tree starts being used by call_srcu(). // By the end of this state, all of the call_srcu() // invocations that were running on a non-boot CPU // and using the boot CPU's callback queue will have // completed. #define SRCU_SIZE_WAIT_CBS1 4 // Don't trust the ->srcu_have_cbs[] grace-period #define SRCU_SIZE_WAIT_CBS2 5 // sequence elements or the ->srcu_data_have_cbs[] #define SRCU_SIZE_WAIT_CBS3 6 // CPU-bitmask elements until all four elements of #define SRCU_SIZE_WAIT_CBS4 7 // each array have been initialized. #define SRCU_SIZE_BIG 8 // The srcu_node combining tree is fully initialized // and all aspects of it are being put to use. /* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 /* * Values for initializing gp sequence fields. Higher values allow wrap arounds to * occur earlier. * The second value with state is useful in the case of static initialization of * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its * lower bits (or else it will appear to be already initialized within * the call check_init_srcu_struct()). */ #define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT) #define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1) #define __SRCU_USAGE_INIT(name) \ { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL, \ .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE, \ .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } #define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ .srcu_sup = &usage_name, \ __SRCU_DEP_MAP_INIT(name) #define __SRCU_STRUCT_INIT_MODULE(name, usage_name) \ { \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } #define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ { \ .sda = &pcpu_name, \ __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ } /* * Define and initialize a srcu struct at build time. * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. * * Note that although DEFINE_STATIC_SRCU() hides the name from other * files, the per-CPU variable rules nevertheless require that the * chosen name be globally unique. These rules also prohibit use of * DEFINE_STATIC_SRCU() within a function. If these rules are too * restrictive, declare the srcu_struct manually. For example, in * each file: * * static struct srcu_struct my_srcu; * * Then, before the first use of each my_srcu, manually initialize it: * * init_srcu_struct(&my_srcu); * * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #ifdef MODULE # define __DEFINE_SRCU(name, is_static) \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else # define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = \ __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) #endif #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Returns an index that must be passed to the matching * srcu_read_unlock_lite(). * * Note that this_cpu_inc() is an RCU read-side critical section either * because it disables interrupts, because it is a single instruction, * or because it is a read-modify-write atomic operation, depending on * the whims of the architecture. */ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp) { int idx; RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite()."); idx = READ_ONCE(ssp->srcu_idx) & 0x1; this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */ barrier(); /* Avoid leaking the critical section. */ return idx; } /* * Removes the count for the old reader from the appropriate * per-CPU element of the srcu_struct. Note that this may well be a * different CPU than that which was incremented by the corresponding * srcu_read_lock_lite(), but it must be within the same task. * * Note that this_cpu_inc() is an RCU read-side critical section either * because it disables interrupts, because it is a single instruction, * or because it is a read-modify-write atomic operation, depending on * the whims of the architecture. */ static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) { barrier(); /* Avoid leaking the critical section. */ this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite()."); } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); // Record _lite() usage even for CONFIG_PROVE_RCU=n kernels. static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) { struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) return; // Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered. __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); } // Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { if (IS_ENABLED(CONFIG_PROVE_RCU)) __srcu_check_read_flavor(ssp, read_flavor); } #endif
25 4 1 4 11 1 4 15 16 16 15 16 16 16 1 57 1 1 1 1 1 1 11 50 50 49 50 49 50 50 5 50 49 49 11 11 11 11 11 11 24 11 11 11 11 11 11 11 11 11 87 11 11 11 11 11 11 11 11 11 11 44 44 52 5 52 35 16 50 50 50 50 49 50 52 11 11 11 4 4 5 3 5 4 4 1 4 9 9 9 9 9 2 7 6 7 6 6 1 6 4 6 3 9 11 11 11 11 50 16 20 52 50 50 52 20 35 35 1 1 1 1 1 61 60 52 51 52 61 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static bool super_flags(const struct super_block *sb, unsigned int flags) { /* * Pairs with smp_store_release() in super_wake() and ensures * that we see @flags after we're woken. */ return smp_load_acquire(&sb->s_flags) & flags; } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: The function returns true if SB_BORN was set and with * s_umount held. The function returns false if SB_DYING was * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); /* Don't pointlessly acquire s_umount. */ if (super_flags(sb, SB_DYING)) return false; __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) { super_unlock(sb, excl); return false; } WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); return true; } /* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super(struct super_block *sb) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_excl(sb); if (locked) { if (atomic_inc_not_zero(&sb->s_active)) { put_super(sb); return true; } super_unlock_excl(sb); } wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. */ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. */ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { if (sb->s_root) f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { if (sb->s_root) f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { bool locked; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ locked = super_lock(sb, excl); if (locked) { if (sb->s_root) return sb; super_unlock(sb, excl); } /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); break; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb) { bool locked = super_lock_excl(sb); if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } if (locked) super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { bool locked = super_lock_excl(sb); if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); return; } if (locked) super_unlock_excl(sb); } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); locked = super_lock(sb, excl); /* * If the superblock wasn't already SB_DYING then we hold * s_umount and can safely drop our temporary reference. */ put_super(sb); if (!locked) return NULL; if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock(sb, excl); return NULL; } return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } static struct super_block *get_bdev_super(struct block_device *bdev) { bool active = false; struct super_block *sb; sb = bdev_super_lock(bdev, true); if (sb) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } if (!active) return NULL; return sb; } /** * fs_bdev_freeze - freeze owning filesystem of block device * @bdev: block device * * Freeze the filesystem that owns this block device if it is still * active. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else error = freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } /** * fs_bdev_thaw - thaw owning filesystem of block device * @bdev: block device * * Thaw the filesystem that owns this block device. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the thaw was successful zero is returned. If the thaw * failed a negative error code is returned. If this function * returns zero it doesn't mean that the filesystem is unfrozen * as it may have been frozen multiple times (kernel may hold a * freeze or might be frozen from other block devices). */ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); /* * The block device may have been frozen before it was claimed by a * filesystem. Concurrently another process might try to mount that * frozen block device and has temporarily claimed the block device for * that purpose causing a concurrent fs_bdev_thaw() to end up here. The * mounter is already about to abort mounting because they still saw an * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return * NULL in that case. */ sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else error = thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); deactivate_super(sb); return error; } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct file *bdev_file; struct block_device *bdev; bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_file); } bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_fput(bdev_file); return -EACCES; } /* * It is enough to check bdev was not frozen before we set * s_bdev as freezing will wait until SB_BORN is set. */ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock * @flags: GET_TREE_BDEV_* flags */ int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { error = setup_bdev_super(s, fc->sb_flags, fc); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL_GPL(get_tree_bdev_flags); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { return get_tree_bdev_flags(fc, fill_super, 0); } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { error = setup_bdev_super(s, flags, NULL); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_fput(sb->s_bdev_file); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); int reconfigure_single(struct super_block *s, int flags, void *data) { struct fs_context *fc; int ret; /* The caller really need to be passing fc down into mount_single(), * then a chunk of this can be removed. [Bollocks -- AV] * Better yet, reconfiguration shouldn't happen, but rather the second * mount should be rejected if the parameters are not compatible. */ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); ret = parse_monolithic_mount_data(fc, data); if (ret < 0) goto out; ret = reconfigure_super(fc); out: put_fs_context(fc); return ret; } static int compare_single(struct super_block *s, void *p) { return 1; } struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (!error) s->s_flags |= SB_ACTIVE; } else { error = reconfigure_single(s, flags, data); } if (unlikely(error)) { deactivate_locked_super(s); return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) ++sb->s_writers.freeze_kcount; if (who & FREEZE_HOLDER_USERSPACE) ++sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) --sb->s_writers.freeze_kcount; if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) --sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; if (who & FREEZE_HOLDER_USERSPACE) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_ucount == 0; return false; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * A filesystem may hold multiple devices and thus a filesystems may be * frozen through the block layer via multiple block devices. In this * case the request is marked as being allowed to nest by passing * FREEZE_MAY_NEST. The filesystem remains frozen until all block * devices are unfrozen. If multiple freezes are attempted without * FREEZE_MAY_NEST -EBUSY will be returned. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while freezing!"); return -EINVAL; } atomic_inc(&sb->s_active); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (may_freeze(sb, who)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; /* All freezers share a single active reference. */ deactivate_locked_super(sb); return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; /* * All freezers share a single active reference. * So just unlock in case there are any left. */ if (freeze_dec(sb, who)) goto out_unlock; if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { pr_err("VFS: Filesystem thaw failed\n"); freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: deactivate_locked_super(sb); return 0; out_unlock: super_unlock_excl(sb); return error; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed * * A filesystem may hold multiple devices and thus a filesystems may * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
52 100 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __VDSO_MATH64_H #define __VDSO_MATH64_H static __always_inline u32 __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) { u32 ret = 0; while (dividend >= divisor) { /* The following asm() prevents the compiler from optimising this loop into a modulo operation. */ asm("" : "+rm"(dividend)); dividend -= divisor; ret++; } *remainder = dividend; return ret; } #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_add_u64_shr static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift) { return (u64)((((unsigned __int128)a * mul) + b) >> shift); } #endif /* mul_u64_u32_add_u64_shr */ #else #ifndef mul_u64_u32_add_u64_shr #ifndef mul_u32_u32 static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #define mul_u32_u32 mul_u32_u32 #endif static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift) { u32 ah = a >> 32, al = a; bool ovf; u64 ret; ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret); ret >>= shift; if (ovf && shift) ret += 1ULL << (64 - shift); if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_add_u64_shr */ #endif #endif /* __VDSO_MATH64_H */
3212 19 19 19 19 2 19 2 19 1031 1038 1045 1027 19 19 19 19 19 19 19 19 19 923 128 126 2 127 127 127 128 7 1 1 1 1 255 251 2525 2528 2517 2522 911 913 2085 2082 3205 3228 1 250 249 250 3003 2998 3017 3015 677 2992 2994 2988 2998 2998 268 2993 432 2657 1 1 431 429 2 427 426 428 22 2 22 6 2687 2205 124 2103 2094 1894 5 2223 2214 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include <linux/sched.h> /* test_thread_flag(), ... */ #include <linux/sched/task_stack.h> /* task_stack_*(), ... */ #include <linux/kdebug.h> /* oops_begin/end, ... */ #include <linux/extable.h> /* search_exception_tables */ #include <linux/memblock.h> /* max_low_pfn */ #include <linux/kfence.h> /* kfence_handle_page_fault */ #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> #include <linux/mm.h> /* find_and_lock_vma() */ #include <linux/vmalloc.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> #include <asm/fred.h> #include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> /* * Returns 0 if mmiotrace is disabled, or if the fault is not * handled by mmiotrace: */ static nokprobe_inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; return 0; } /* * Prefetch quirks: * * 32-bit mode: * * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. * Check that here and ignore it. This is AMD erratum #91. * * 64-bit mode: * * Sometimes the CPU reports invalid exceptions on prefetch. * Check that here and ignore it. * * Opcode checker based on code by Richard Brunner. */ static inline int check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, unsigned char opcode, int *prefetch) { unsigned char instr_hi = opcode & 0xf0; unsigned char instr_lo = opcode & 0x0f; switch (instr_hi) { case 0x20: case 0x30: /* * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. * In X86_64 long mode, the CPU will signal invalid * opcode if some of these prefixes are present so * X86_64 will never get here anyway */ return ((instr_lo & 7) == 0x6); #ifdef CONFIG_X86_64 case 0x40: /* * In 64-bit mode 0x40..0x4F are valid REX prefixes */ return (!user_mode(regs) || user_64bit_mode(regs)); #endif case 0x60: /* 0x64 thru 0x67 are valid prefixes in all modes. */ return (instr_lo & 0xC) == 0x4; case 0xF0: /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ return !instr_lo || (instr_lo>>1) == 1; case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ if (get_kernel_nofault(opcode, instr)) return 0; *prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); return 0; default: return 0; } } static bool is_amd_k8_pre_npt(void) { struct cpuinfo_x86 *c = &boot_cpu_data; return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && c->x86_vendor == X86_VENDOR_AMD && c->x86 == 0xf && c->x86_model < 0x40); } static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; int prefetch = 0; /* Erratum #91 affects AMD K8, pre-NPT CPUs */ if (!is_amd_k8_pre_npt()) return 0; /* * If it was a exec (instruction fetch) fault on NX page, then * do not ignore the fault: */ if (error_code & X86_PF_INSTR) return 0; instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; /* * This code has historically always bailed out if IP points to a * not-present page (e.g. due to a race). No one has ever * complained about this. */ pagefault_disable(); while (instr < max_instr) { unsigned char opcode; if (user_mode(regs)) { if (get_user(opcode, (unsigned char __user *) instr)) break; } else { if (get_kernel_nofault(opcode, instr)) break; } instr++; if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; } pagefault_enable(); return prefetch; } DEFINE_SPINLOCK(pgd_lock); LIST_HEAD(pgd_list); #ifdef CONFIG_X86_32 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pgd += index; pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) return NULL; /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would * set_p4d/set_pud. */ p4d = p4d_offset(pgd, address); p4d_k = p4d_offset(pgd_k, address); if (!p4d_present(*p4d_k)) return NULL; pud = pud_offset(p4d, address); pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); if (pmd_present(*pmd) != pmd_present(*pmd_k)) set_pmd(pmd, *pmd_k); if (!pmd_present(*pmd_k)) return NULL; else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } /* * Handle a fault on the vmalloc or module mapping area * * This is needed because there is a race condition between the time * when the vmalloc mapping code updates the PMD to the point in time * where it synchronizes this update with the other page-tables in the * system. * * In this race window another thread/CPU can map an area on the same * PMD, finds it already present and does not synchronize it with the * rest of the system yet. As a result v[mz]alloc might return areas * which are not mapped in every page-table in the system, causing an * unhandled page-fault when they are accessed. */ static noinline int vmalloc_fault(unsigned long address) { unsigned long pgd_paddr; pmd_t *pmd_k; pte_t *pte_k; /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; /* * Synchronize this task's top level page-table * with the 'reference' page table. * * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; return 0; } NOKPROBE_SYMBOL(vmalloc_fault); void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; for (addr = start & PMD_MASK; addr >= TASK_SIZE_MAX && addr < VMALLOC_END; addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; #ifdef CONFIG_X86_PAE pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #define pr_pde pr_cont #else #define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); #undef pr_pde /* * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: pr_cont("\n"); } #else /* CONFIG_X86_64: */ #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" "******* Working around it, but it may cause SEGVs or burn power.\n" "******* Please consider a BIOS update.\n" "******* Disabling USB legacy in the BIOS may also help.\n"; #endif static int bad_address(void *p) { unsigned long dummy; return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; if (bad_address(pgd)) goto bad; pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (bad_address(p4d)) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; pr_cont("PTE %lx", pte_val(*pte)); out: pr_cont("\n"); return; bad: pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. * * BIOS SMM functions are required to use a specific workaround * to avoid corruption of the 64bit RIP register on C stepping K8. * * A lot of BIOS that didn't get tested properly miss this. * * The OS sees this as a page fault with the upper 32bits of RIP cleared. * Try to work around it here. * * Note we only handle faults in kernel here. * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 != 0xf) return 0; if (user_mode(regs)) return 0; if (address != regs->ip) return 0; if ((address >> 32) != 0) return 0; address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { printk_once(errata93_warning); regs->ip = address; return 1; } #endif return 0; } /* * Work around K8 erratum #100 K8 in compat mode occasionally jumps * to illegal addresses >4GB. * * We catch this in the page fault handler because these addresses * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; } /* Pentium F0 0F C7 C8 bug workaround: */ static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && idt_is_f00f_address(address)) { handle_invalid_op(regs); return 1; } #endif return 0; } static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) { u32 offset = (index >> 3) * sizeof(struct desc_struct); unsigned long addr; struct ldttss_desc desc; if (index == 0) { pr_alert("%s: NULL\n", name); return; } if (offset + sizeof(struct ldttss_desc) >= gdt->size) { pr_alert("%s: 0x%hx -- out of bounds\n", name, index); return; } if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), sizeof(struct ldttss_desc))) { pr_alert("%s: 0x%hx -- GDT entry is not readable\n", name, index); return; } addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); #ifdef CONFIG_X86_64 addr |= ((u64)desc.base3 << 32); #endif pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!oops_may_print()) return; if (error_code & X86_PF_INSTR) { unsigned int level; bool nx, rw; pgd_t *pgd; pte_t *pte; pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw); if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx)) pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && !nx && (pgd_flags(*pgd) & _PAGE_USER) && (__read_cr4() & X86_CR4_SMEP)) pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", from_kuid(&init_user_ns, current_uid())); } if (address < PAGE_SIZE && !user_mode(regs)) pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", (void *)address); else pr_alert("BUG: unable to handle page fault for address: %px\n", (void *)address); pr_alert("#PF: %s %s in %s mode\n", (error_code & X86_PF_USER) ? "user" : "supervisor", (error_code & X86_PF_INSTR) ? "instruction fetch" : (error_code & X86_PF_WRITE) ? "write access" : "read access", user_mode(regs) ? "user" : "kernel"); pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. Perhaps * surprisingly, if the CPU tries to deliver a benign or * contributory exception from user code and gets a page fault * during delivery, the page fault can be delivered as though * it originated directly from user code. This could happen * due to wrong permissions on the IDT, GDT, LDT, TSS, or * kernel or IST stack. */ store_idt(&idt); /* Usable even on Xen PV -- it's just slow. */ native_store_gdt(&gdt); pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", idt.address, idt.size, gdt.address, gdt.size); store_ldt(ldtr); show_ldttss(&gdt, "LDTR", ldtr); store_tr(tr); show_ldttss(&gdt, "TR", tr); } dump_pagetable(address); if (error_code & X86_PF_RMP) snp_dump_hva_rmpentry(address); } static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk; unsigned long flags; int sig; flags = oops_begin(); tsk = current; sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); if (__die("Bad pagetable", regs, error_code)) sig = 0; oops_end(flags, regs, sig); } static void sanitize_error_code(unsigned long address, unsigned long *error_code) { /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. * * NB: This means that failed vsyscalls with vsyscall=none * will have the PROT bit. This doesn't leak any * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) *error_code |= X86_PF_PROT; } static void set_signal_archinfo(unsigned long address, unsigned long error_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; tsk->thread.cr2 = address; } static noinline void page_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { #ifdef CONFIG_VMAP_STACK struct stack_info info; #endif unsigned long flags; int sig; if (user_mode(regs)) { /* * Implicit kernel access from user mode? Skip the stack * overflow and EFI special cases. */ goto oops; } #ifdef CONFIG_VMAP_STACK /* * Stack overflow? During boot, we can fault near the initial * stack in the direct map, but that's not an overflow -- check * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && get_stack_guard_info((void *)address, &info)) { /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but * double-fault even before we get this far, in which case * we're fine: the double-fault handler will deal with it. * * We don't want to make it all the way into the oops code * and then double-fault, though, because we're likely to * break the console driver and lose most of the stack dump. */ call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), handle_stack_overflow, ASM_CALL_ARG3, , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); unreachable(); } #endif /* * Buggy firmware could access regions which might page fault. If * this happens, EFI has a special OOPS path that will try to * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) efi_crash_gracefully_on_page_fault(address); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) return; oops: /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ flags = oops_begin(); show_fault_oops(regs, error_code, address); if (task_stack_end_corrupted(current)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code, u32 pkey) { WARN_ON_ONCE(user_mode(regs)); /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) return; /* * AMD erratum #91 manifests as a spurious page fault on a PREFETCH * instruction. */ if (is_prefetch(regs, error_code, address)) return; page_fault_oops(regs, error_code, address); } /* * Print out info about fatal segfaults, if the show_unhandled_signals * sysctl is set: */ static inline void show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; /* This is a racy snapshot, but it's better than nothing. */ int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; if (!printk_ratelimit()) return; printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", loglvl, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); print_vma_addr(KERN_CONT " in ", regs->ip); /* * Dump the likely CPU where the fatal segfault happened. * This can help identify faulty hardware. */ printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, topology_core_id(cpu), topology_physical_package_id(cpu)); printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); } static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) { struct task_struct *tsk = current; if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, si_code, pkey); return; } if (!(error_code & X86_PF_USER)) { /* Implicit user access to kernel memory -- just oops */ page_fault_oops(regs, error_code, address); return; } /* * User mode accesses just cause a SIGSEGV. * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came * from user space: */ if (is_prefetch(regs, error_code, address)) return; if (is_errata100(regs, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); set_signal_archinfo(address, error_code); if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); else force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); } static void __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma, u32 pkey, int si_code) { /* * Something tried to access memory that isn't in our memory map.. * Fix it, but check if it's kernel or user first.. */ if (mm) mmap_read_unlock(mm); else vma_end_read(vma); __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { /* This code is always called on the current mm */ bool foreign = false; if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; /* this checks permission keys on the VMA: */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return true; return false; } static noinline void bad_area_access_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct mm_struct *mm, struct vm_area_struct *vma) { /* * This OSPKE check is not strictly necessary at runtime. * But, doing it this way allows compiler optimizations * if pkeys are compiled out. */ if (bad_area_access_from_pkeys(error_code, vma)) { /* * A protection key fault means that the PKRU value did not allow * access to some PTE. Userspace can figure out what PKRU was * from the XSAVE state. This function captures the pkey from * the vma and passes it to userspace so userspace can discover * which protection key was set on the PTE. * * If we get here, we know that the hardware signaled a X86_PF_PK * fault and that there was a VMA once we got in the fault * handler. It does *not* guarantee that the VMA we find here * was the one that we faulted on. * * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); * 2. T1 : set PKRU to deny access to pkey=4, touches page * 3. T1 : faults... * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); * 5. T1 : enters fault handler, takes mmap_lock, etc... * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ u32 pkey = vma_pkey(vma); __bad_area(regs, error_code, address, mm, vma, pkey, SEGV_PKUERR); } else { __bad_area(regs, error_code, address, mm, vma, 0, SEGV_ACCERR); } } static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; sanitize_error_code(address, &error_code); if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) return; set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { struct task_struct *tsk = current; unsigned lsb = 0; pr_err( "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) return 0; if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) return 0; return 1; } /* * Handle a spurious fault caused by a stale TLB entry. * * This allows us to lazily refresh the TLB when increasing the * permissions of a kernel page (RO -> RW or NX -> X). Doing it * eagerly is very expensive since that implies doing a full * cross-processor TLB flush, even if no stale TLB entries exist * on other processors. * * Spurious faults may only occur if the TLB contains an entry with * fewer permission than the page table entry. Non-present (P = 0) * and reserved bit (R = 1) faults are never spurious. * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. * * Returns non-zero if a spurious fault was handled, zero otherwise. * * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 * (Optional Invalidation). */ static noinline int spurious_kernel_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; int ret; /* * Only writes to RO or instruction fetches from NX may cause * spurious faults. * * These could be from user or supervisor accesses but the TLB * is only lazily flushed after a kernel mapping protection * change, so user accesses are not expected to cause spurious * faults. */ if (error_code != (X86_PF_WRITE | X86_PF_PROT) && error_code != (X86_PF_INSTR | X86_PF_PROT)) return 0; pgd = init_mm.pgd + pgd_index(address); if (!pgd_present(*pgd)) return 0; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) return 0; if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); if (!pte_present(*pte)) return 0; ret = spurious_kernel_fault_check(error_code, pte); if (!ret) return 0; /* * Make sure we have permissions in PMD. * If not, then there's a bug in the page tables: */ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); return ret; } NOKPROBE_SYMBOL(spurious_kernel_fault); int show_unhandled_signals = 1; static inline int access_error(unsigned long error_code, struct vm_area_struct *vma) { /* This is only called for the current mm, so: */ bool foreign = false; /* * Read or write was blocked by protection keys. This is * always an unconditional error and can never result in * a follow-up action to resolve the fault, like a COW. */ if (error_code & X86_PF_PK) return 1; /* * SGX hardware blocked the access. This usually happens * when the enclave memory contents have been destroyed, like * after a suspend/resume cycle. In any case, the kernel can't * fix the cause of the fault. Handle the fault as an access * error even in cases where no actual access violation * occurred. This allows userspace to rebuild the enclave in * response to the signal. */ if (unlikely(error_code & X86_PF_SGX)) return 1; /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. */ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), (error_code & X86_PF_INSTR), foreign)) return 1; /* * Shadow stack accesses (PF_SHSTK=1) are only permitted to * shadow stack VMAs. All other accesses result in an error. */ if (error_code & X86_PF_SHSTK) { if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; } /* read, present: */ if (unlikely(error_code & X86_PF_PROT)) return 1; /* read, not present: */ if (unlikely(!vma_is_accessible(vma))) return 1; return 0; } bool fault_in_kernel_space(unsigned long address) { /* * On 64-bit systems, the vsyscall page is at an address above * TASK_SIZE_MAX, but is not considered part of the kernel * address space. */ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) return false; return address >= TASK_SIZE_MAX; } /* * Called for all faults where 'address' is part of the kernel address * space. Might get called for faults that originate from *code* that * ran in userspace or the kernel. */ static void do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { /* * Protection keys exceptions only happen on user pages. We * have no user pages in the kernel portion of the address * space, so do not expect them here. */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); #ifdef CONFIG_X86_32 /* * We can fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. * * Before doing this on-demand faulting, ensure that the * fault is not any of the following: * 1. A fault on a PTE with a reserved bit set. * 2. A fault caused by a user-mode access. (Do not demand- * fault kernel memory due to user-mode accesses). * 3. A fault caused by a page-level protection violation. * (A demand fault would be on a non-present page which * would have X86_PF_PROT==0). * * This is only needed to close a race condition on x86-32 in * the vmalloc mapping/unmapping code. See the comment above * vmalloc_fault() for details. On x86-64 the race does not * exist as the vmalloc mappings don't need to be synchronized * there. */ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; } #endif if (is_f00f_bug(regs, hw_error_code, address)) return; /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Note, despite being a "bad area", there are quite a few * acceptable reasons to get here, such as erratum fixups * and handling kernel code that can fault, like get_user(). * * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, hw_error_code, address); } NOKPROBE_SYMBOL(do_kern_addr_fault); /* * Handle faults in the user portion of the address space. Nothing in here * should check X86_PF_USER without a specific justification: for almost * all purposes, we should treat a normal kernel access to user memory * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. * The one exception is AC flag handling, which is, per the x86 * architecture, special for WRUSS. */ static inline void do_user_addr_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; mm = tsk->mm; if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { /* * Whoops, this is kernel mode code trying to execute from * user memory. Unless this is AMD erratum #93, which * corrupts RIP such that it looks like a user address, * this is unrecoverable. Don't even try to look up the * VMA or look for extable entries. */ if (is_errata93(regs, address)) return; page_fault_oops(regs, error_code, address); return; } /* kprobes don't want to hook the spurious faults: */ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* * Reserved bits are never expected to be set on * entries in the user portion of the page tables. */ if (unlikely(error_code & X86_PF_RSVD)) pgtable_bad(regs, error_code, address); /* * If SMAP is on, check for invalid kernel (supervisor) access to user * pages in the user address space. The odd case here is WRUSS, * which, according to the preliminary documentation, does not respect * SMAP and will have the USER bit set so, in all cases, SMAP * enforcement appears to be consistent with the USER bit. */ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && !(error_code & X86_PF_USER) && !(regs->flags & X86_EFLAGS_AC))) { /* * No extable entry here. This was a kernel access to an * invalid pointer. get_kernel_nofault() will not get here. */ page_fault_oops(regs, error_code, address); return; } /* * If we're in an interrupt, have no user context or are running * in a region with pagefaults disabled then we must not take the fault */ if (unlikely(faulthandler_disabled() || !mm)) { bad_area_nosemaphore(regs, error_code, address); return; } /* Legacy check - remove this after verifying that it doesn't trigger */ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { bad_area_nosemaphore(regs, error_code, address); return; } local_irq_enable(); perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * Read-only permissions can not be expressed in shadow stack PTEs. * Treat all shadow stack accesses as WRITE faults. This ensures * that the MM will prepare everything (e.g., break COW) such that * maybe_mkwrite() can create a proper shadow stack PTE. */ if (error_code & X86_PF_SHSTK) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; /* * We set FAULT_FLAG_USER based on the register state, not * based on X86_PF_USER. User space accesses that cause * system page faults are still user accesses. */ if (user_mode(regs)) flags |= FAULT_FLAG_USER; #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The * vsyscall page is at a high address (>PAGE_OFFSET), but is * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. * * PKRU never rejects instruction fetches, so we don't need * to consider the PF_PK bit. */ if (is_vsyscall_vaddr(address)) { if (emulate_vsyscall(error_code, regs, address)) return; } #endif if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); if (!vma) goto lock_mmap; if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, NULL, vma); count_vm_vma_lock_event(VMA_LOCK_SUCCESS); return; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); goto done; } count_vm_vma_lock_event(VMA_LOCK_RETRY); if (fault & VM_FAULT_MAJOR) flags |= FAULT_FLAG_TRIED; /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } lock_mmap: retry: vma = lock_mm_and_find_vma(mm, address, regs); if (unlikely(!vma)) { bad_area_nosemaphore(regs, error_code, address); return; } /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, mm, vma); return; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. * * Note that handle_userfault() may also release and reacquire mmap_lock * (and not return with VM_FAULT_RETRY), when returning to userland to * repeat the page fault later with a VM_FAULT_NOPAGE retval * (potentially after handling any pending signal during the return to * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { /* * Quick path to respond to signals. The core mm code * has unlocked the mm for us if we get here. */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } /* The fault is fully completed (including releasing mmap lock) */ if (fault & VM_FAULT_COMPLETED) return; /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } mmap_read_unlock(mm); done: if (likely(!(fault & VM_FAULT_ERROR))) return; if (fatal_signal_pending(current) && !user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, 0, 0, ARCH_DEFAULT_PKEY); return; } if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, SIGSEGV, SEGV_MAPERR, ARCH_DEFAULT_PKEY); return; } /* * We ran out of memory, call the OOM killer, and return the * userspace (which will retry the fault, or kill us if we got * oom-killed): */ pagefault_out_of_memory(); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else if (fault & VM_FAULT_SIGSEGV) bad_area_nosemaphore(regs, error_code, address); else BUG(); } } NOKPROBE_SYMBOL(do_user_addr_fault); static __always_inline void trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, unsigned long address) { if (!trace_pagefault_enabled()) return; if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { trace_page_fault_entries(regs, error_code, address); if (unlikely(kmmio_fault(regs, address))) return; /* Was the fault on kernel-controlled part of the address space? */ if (unlikely(fault_in_kernel_space(address))) { do_kern_addr_fault(regs, error_code, address); } else { do_user_addr_fault(regs, error_code, address); /* * User address page fault handling might have reenabled * interrupts. Fixing up all potential exit points of * do_user_addr_fault() and its leaf functions is just not * doable w/o creating an unholy mess or turning the code * upside down. */ local_irq_disable(); } } DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { irqentry_state_t state; unsigned long address; address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(&current->mm->mmap_lock); /* * KVM uses #PF vector to deliver 'page not present' events to guests * (asynchronous page fault mechanism). The event happens when a * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * * We are relying on the interrupted context being sane (valid RSP, * relevant locks not held, etc.), which is fine as long as the * interrupted context had IF=1. We are also relying on the KVM * async pf type field and CR2 being read consistently instead of * getting values from real and async page faults mixed up. * * Fingers crossed. * * The async #PF handling code takes care of idtentry handling * itself. */ if (kvm_handle_async_pf(regs, (u32)address)) return; /* * Entry handling for valid #PF from kernel mode is slightly * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps * debuggability. */ state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); irqentry_exit(regs, state); }
7 132 131 38 132 21 7 131 50 51 51 51 51 50 51 51 50 51 50 51 51 51 51 51 51 49 15 51 51 50 51 50 50 49 39 41 40 141 10 40 41 41 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/runtime.c - Helper functions for device runtime PM * * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu> */ #include <linux/sched/mm.h> #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/rculist.h> #include <trace/events/rpm.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { pm_callback_t cb; const struct dev_pm_ops *ops; if (dev->pm_domain) ops = &dev->pm_domain->ops; else if (dev->type && dev->type->pm) ops = dev->type->pm; else if (dev->class && dev->class->pm) ops = dev->class->pm; else if (dev->bus && dev->bus->pm) ops = dev->bus->pm; else ops = NULL; if (ops) cb = *(pm_callback_t *)((void *)ops + cb_offset); else cb = NULL; if (!cb && dev->driver && dev->driver->pm) cb = *(pm_callback_t *)((void *)dev->driver->pm + cb_offset); return cb; } #define RPM_GET_CALLBACK(dev, callback) \ __rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback)) static int rpm_resume(struct device *dev, int rpmflags); static int rpm_suspend(struct device *dev, int rpmflags); /** * update_pm_runtime_accounting - Update the time accounting of power states * @dev: Device to update the accounting for * * In order to be able to have time accounting of the various power states * (as used by programs such as PowerTOP to show the effectiveness of runtime * PM), we need to track the time spent in each state. * update_pm_runtime_accounting must be called each time before the * runtime_status field is updated, to account the time in the old state * correctly. */ static void update_pm_runtime_accounting(struct device *dev) { u64 now, last, delta; if (dev->power.disable_depth > 0) return; last = dev->power.accounting_timestamp; now = ktime_get_mono_fast_ns(); dev->power.accounting_timestamp = now; /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved * timesptamp. */ if (now < last) return; delta = now - last; if (dev->power.runtime_status == RPM_SUSPENDED) dev->power.suspended_time += delta; else dev->power.active_time += delta; } static void __update_runtime_status(struct device *dev, enum rpm_status status) { update_pm_runtime_accounting(dev); trace_rpm_status(dev, status); dev->power.runtime_status = status; } static u64 rpm_get_accounted_time(struct device *dev, bool suspended) { u64 time; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); update_pm_runtime_accounting(dev); time = suspended ? dev->power.suspended_time : dev->power.active_time; spin_unlock_irqrestore(&dev->power.lock, flags); return time; } u64 pm_runtime_active_time(struct device *dev) { return rpm_get_accounted_time(dev, false); } u64 pm_runtime_suspended_time(struct device *dev) { return rpm_get_accounted_time(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_suspended_time); /** * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. * @dev: Device to handle. */ static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } /** * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. * @dev: Device to handle. */ static void pm_runtime_cancel_pending(struct device *dev) { pm_runtime_deactivate_timer(dev); /* * In case there's a request pending, make sure its work function will * return without doing anything. */ dev->power.request = RPM_REQ_NONE; } /* * pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time. * @dev: Device to handle. * * Compute the autosuspend-delay expiration time based on the device's * power.last_busy time. If the delay has already expired or is disabled * (negative) or the power.use_autosuspend flag isn't set, return 0. * Otherwise return the expiration time in nanoseconds (adjusted to be nonzero). * * This function may be called either with or without dev->power.lock held. * Either way it can be racy, since power.last_busy may be updated at any time. */ u64 pm_runtime_autosuspend_expiration(struct device *dev) { int autosuspend_delay; u64 expires; if (!dev->power.use_autosuspend) return 0; autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) return 0; expires = READ_ONCE(dev->power.last_busy); expires += (u64)autosuspend_delay * NSEC_PER_MSEC; if (expires > ktime_get_mono_fast_ns()) return expires; /* Expires in the future */ return 0; } EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); static int dev_memalloc_noio(struct device *dev, void *data) { return dev->power.memalloc_noio; } /* * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. * @dev: Device to handle. * @enable: True for setting the flag and False for clearing the flag. * * Set the flag for all devices in the path from the device to the * root device in the device tree if @enable is true, otherwise clear * the flag for devices in the path whose siblings don't set the flag. * * The function should only be called by block device, or network * device driver for solving the deadlock problem during runtime * resume/suspend: * * If memory allocation with GFP_KERNEL is called inside runtime * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block * device becomes active and the involed page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * * The lock of dev_hotplug_mutex is held in the function for handling * hotplug race because pm_runtime_set_memalloc_noio() may be called * in async probe(). * * The function should be called between device_add() and device_del() * on the affected device(block/network device). */ void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) { static DEFINE_MUTEX(dev_hotplug_mutex); mutex_lock(&dev_hotplug_mutex); for (;;) { bool enabled; /* hold power lock since bitfield is not SMP-safe. */ spin_lock_irq(&dev->power.lock); enabled = dev->power.memalloc_noio; dev->power.memalloc_noio = enable; spin_unlock_irq(&dev->power.lock); /* * not need to enable ancestors any more if the device * has been enabled. */ if (enabled && enable) break; dev = dev->parent; /* * clear flag of the parent device only if all the * children don't set the flag because ancestor's * flag was set by any one of the descendants. */ if (!dev || (!enable && device_for_each_child(dev, NULL, dev_memalloc_noio))) break; } mutex_unlock(&dev_hotplug_mutex); } EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); /** * rpm_check_suspend_allowed - Test whether a device may be suspended. * @dev: Device to test. */ static int rpm_check_suspend_allowed(struct device *dev) { int retval = 0; if (dev->power.runtime_error) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ else if ((dev->power.deferred_resume && dev->power.runtime_status == RPM_SUSPENDING) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; else if (__dev_pm_qos_resume_latency(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; return retval; } static int rpm_get_suppliers(struct device *dev) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { int retval; if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); /* Ignore suppliers with disabled runtime PM. */ if (retval < 0 && retval != -EACCES) { pm_runtime_put_noidle(link->supplier); return retval; } refcount_inc(&link->rpm_active); } return 0; } /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. * * Drop all runtime PM references associated with @link to its supplier device. */ void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; /* * The additional power.usage_count check is a safety net in case * the rpm_active refcount becomes saturated, in which case * refcount_dec_not_one() would return true forever, but it is not * strictly necessary. */ while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { pm_runtime_release_supplier(link); if (try_to_suspend) pm_request_idle(link->supplier); } } static void rpm_put_suppliers(struct device *dev) { __rpm_put_suppliers(dev, true); } static void rpm_suspend_suppliers(struct device *dev) { struct device_link *link; int idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) pm_request_idle(link->supplier); device_links_read_unlock(idx); } /** * __rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int retval = 0, idx; bool use_links = dev->power.links_count > 0; if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); } else { spin_unlock_irq(&dev->power.lock); /* * Resume suppliers if necessary. * * The device's runtime PM status cannot change until this * routine returns, so it is safe to read the status outside of * the lock. */ if (use_links && dev->power.runtime_status == RPM_RESUMING) { idx = device_links_read_lock(); retval = rpm_get_suppliers(dev); if (retval) { rpm_put_suppliers(dev); goto fail; } device_links_read_unlock(idx); } } if (cb) retval = cb(dev); if (dev->power.irq_safe) { spin_lock(&dev->power.lock); } else { /* * If the device is suspending and the callback has returned * success, drop the usage counters of the suppliers that have * been reference counted on its resume. * * Do that if resume fails too. */ if (use_links && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) || (dev->power.runtime_status == RPM_RESUMING && retval))) { idx = device_links_read_lock(); __rpm_put_suppliers(dev, false); fail: device_links_read_unlock(idx); } spin_lock_irq(&dev->power.lock); } return retval; } /** * rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int rpm_callback(int (*cb)(struct device *), struct device *dev) { int retval; if (dev->power.memalloc_noio) { unsigned int noio_flag; /* * Deadlock might be caused if memory allocation with * GFP_KERNEL happens inside runtime_suspend and * runtime_resume callbacks of one block device's * ancestor or the block device itself. Network * device might be thought as part of iSCSI block * device, so network device and its ancestor should * be marked as memalloc_noio too. */ noio_flag = memalloc_noio_save(); retval = __rpm_callback(cb, dev); memalloc_noio_restore(noio_flag); } else { retval = __rpm_callback(cb, dev); } dev->power.runtime_error = retval; return retval != -EACCES ? retval : -EIO; } /** * rpm_idle - Notify device bus type if the device can be suspended. * @dev: Device to notify the bus type about. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. If * another idle notification has been started earlier, return immediately. If * the RPM_ASYNC flag is set then queue an idle-notification request; otherwise * run the ->runtime_idle() callback directly. If the ->runtime_idle callback * doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); int retval; trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; /* * Any pending request other than an idle notification takes * precedence over us, except that the timer may be running. */ else if (dev->power.request_pending && dev->power.request > RPM_REQ_IDLE) retval = -EAGAIN; /* Act as though RPM_NOWAIT is always set. */ else if (dev->power.idle_notification) retval = -EINPROGRESS; if (retval) goto out; /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; callback = RPM_GET_CALLBACK(dev, runtime_idle); /* If no callback assume success. */ if (!callback || dev->power.no_callbacks) goto out; /* Carry out an asynchronous or a synchronous idle notification. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_IDLE; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } dev->power.idle_notification = true; if (dev->power.irq_safe) spin_unlock(&dev->power.lock); else spin_unlock_irq(&dev->power.lock); retval = callback(dev); if (dev->power.irq_safe) spin_lock(&dev->power.lock); else spin_lock_irq(&dev->power.lock); dev->power.idle_notification = false; wake_up_all(&dev->power.wait_queue); out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } /** * rpm_suspend - Carry out runtime suspend of given device. * @dev: Device to suspend. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. * Cancel a pending idle notification, autosuspend or suspend. If * another suspend has been started earlier, either return immediately * or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC * flags. If the RPM_ASYNC flag is set then queue a suspend request; * otherwise run the ->runtime_suspend() callback directly. When * ->runtime_suspend succeeded, if a deferred resume was requested while * the callback was running then carry it out, otherwise send an idle * notification for its parent (if the suspend succeeded and both * ignore_children of parent->power and irq_safe of dev->power are not set). * If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO * flag is set and the next autosuspend-delay expiration time is in the * future, schedule another autosuspend attempt. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_suspend(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval; trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); if (retval < 0) goto out; /* Conditions are wrong. */ /* Synchronous suspends are not allowed in the RPM_RESUMING state. */ if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC)) retval = -EAGAIN; if (retval) goto out; /* If the autosuspend_delay time hasn't expired yet, reschedule. */ if ((rpmflags & RPM_AUTO) && dev->power.runtime_status != RPM_SUSPENDING) { u64 expires = pm_runtime_autosuspend_expiration(dev); if (expires != 0) { /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; /* * Optimization: If the timer is already running and is * set to expire at or before the autosuspend delay, * avoid the overhead of resetting it. Just let it * expire; pm_suspend_timer_fn() will take care of the * rest. */ if (!(dev->power.timer_expires && dev->power.timer_expires <= expires)) { /* * We add a slack of 25% to gather wakeups * without sacrificing the granularity. */ u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) * (NSEC_PER_MSEC >> 2); dev->power.timer_expires = expires; hrtimer_start_range_ns(&dev->power.suspend_timer, ns_to_ktime(expires), slack, HRTIMER_MODE_ABS); } dev->power.timer_autosuspends = 1; goto out; } } /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); if (dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { retval = -EINPROGRESS; goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the other suspend running in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ /* Carry out an asynchronous or a synchronous suspend. */ if (rpmflags & RPM_ASYNC) { dev->power.request = (rpmflags & RPM_AUTO) ? RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } goto out; } __update_runtime_status(dev, RPM_SUSPENDING); callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; dev_pm_enable_wake_irq_complete(dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) { parent = dev->parent; atomic_add_unless(&parent->power.child_count, -1, 0); } wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { dev->power.deferred_resume = false; rpm_resume(dev, 0); retval = -EAGAIN; goto out; } if (dev->power.irq_safe) goto out; /* Maybe the parent is now able to suspend. */ if (parent && !parent->power.ignore_children) { spin_unlock(&dev->power.lock); spin_lock(&parent->power.lock); rpm_idle(parent, RPM_ASYNC); spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); } /* Maybe the suppliers are now able to suspend. */ if (dev->power.links_count > 0) { spin_unlock_irq(&dev->power.lock); rpm_suspend_suppliers(dev); spin_lock_irq(&dev->power.lock); } out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; fail: dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); if (retval == -EAGAIN || retval == -EBUSY) { dev->power.runtime_error = 0; /* * If the callback routine failed an autosuspend, and * if the last_busy time has been updated so that there * is a new autosuspend expiration time, automatically * reschedule another autosuspend. */ if ((rpmflags & RPM_AUTO) && pm_runtime_autosuspend_expiration(dev) != 0) goto repeat; } else { pm_runtime_cancel_pending(dev); } goto out; } /** * rpm_resume - Carry out runtime resume of given device. * @dev: Device to resume. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be resumed. Cancel * any scheduled or pending requests. If another resume has been started * earlier, either return immediately or wait for it to finish, depending on the * RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in * parallel with this function, either tell the other process to resume after * suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC * flag is set then queue a resume request; otherwise run the * ->runtime_resume() callback directly. Queue an idle notification for the * device if the resume succeeded. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_resume(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval = 0; trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { retval = -EINVAL; } else if (dev->power.disable_depth > 0) { if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; else retval = -EACCES; } if (retval) goto out; /* * Other scheduled or pending requests need to be canceled. Small * optimization: If an autosuspend timer is running, leave it running * rather than cancelling it now only to restart it again in the near * future. */ dev->power.request = RPM_REQ_NONE; if (!dev->power.timer_autosuspends) pm_runtime_deactivate_timer(dev); if (dev->power.runtime_status == RPM_ACTIVE) { retval = 1; goto out; } if (dev->power.runtime_status == RPM_RESUMING || dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { if (dev->power.runtime_status == RPM_SUSPENDING) { dev->power.deferred_resume = true; if (rpmflags & RPM_NOWAIT) retval = -EINPROGRESS; } else { retval = -EINPROGRESS; } goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the operation carried out in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_RESUMING && dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } /* * See if we can skip waking up the parent. This is safe only if * power.no_callbacks is set, because otherwise we don't know whether * the resume will actually succeed. */ if (dev->power.no_callbacks && !parent && dev->parent) { spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING); if (dev->parent->power.disable_depth > 0 || dev->parent->power.ignore_children || dev->parent->power.runtime_status == RPM_ACTIVE) { atomic_inc(&dev->parent->power.child_count); spin_unlock(&dev->parent->power.lock); retval = 1; goto no_callback; /* Assume success. */ } spin_unlock(&dev->parent->power.lock); } /* Carry out an asynchronous or a synchronous resume. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_RESUME; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } retval = 0; goto out; } if (!parent && dev->parent) { /* * Increment the parent's usage counter and resume it if * necessary. Not needed if dev is irq-safe; then the * parent is permanently resumed. */ parent = dev->parent; if (dev->power.irq_safe) goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); spin_lock(&parent->power.lock); /* * Resume the parent if it has runtime PM enabled and not been * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { rpm_resume(parent, 0); if (parent->power.runtime_status != RPM_ACTIVE) retval = -EBUSY; } spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); if (retval) goto out; goto repeat; } skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ __update_runtime_status(dev, RPM_RESUMING); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); pm_runtime_mark_last_busy(dev); if (parent) atomic_inc(&parent->power.child_count); } wake_up_all(&dev->power.wait_queue); if (retval >= 0) rpm_idle(dev, RPM_ASYNC); out: if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); spin_lock_irq(&dev->power.lock); } trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } /** * pm_runtime_work - Universal runtime PM work function. * @work: Work structure used for scheduling the execution of this function. * * Use @work to get the device object the work is to be done for, determine what * is to be done and execute the appropriate runtime PM function. */ static void pm_runtime_work(struct work_struct *work) { struct device *dev = container_of(work, struct device, power.work); enum rpm_request req; spin_lock_irq(&dev->power.lock); if (!dev->power.request_pending) goto out; req = dev->power.request; dev->power.request = RPM_REQ_NONE; dev->power.request_pending = false; switch (req) { case RPM_REQ_NONE: break; case RPM_REQ_IDLE: rpm_idle(dev, RPM_NOWAIT); break; case RPM_REQ_SUSPEND: rpm_suspend(dev, RPM_NOWAIT); break; case RPM_REQ_AUTOSUSPEND: rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO); break; case RPM_REQ_RESUME: rpm_resume(dev, RPM_NOWAIT); break; } out: spin_unlock_irq(&dev->power.lock); } /** * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). * @timer: hrtimer used by pm_schedule_suspend(). * * Check if the time is right and queue a suspend request. */ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) { struct device *dev = container_of(timer, struct device, power.suspend_timer); unsigned long flags; u64 expires; spin_lock_irqsave(&dev->power.lock, flags); expires = dev->power.timer_expires; /* * If 'expires' is after the current time, we've been called * too early. */ if (expires > 0 && expires < ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); } spin_unlock_irqrestore(&dev->power.lock, flags); return HRTIMER_NORESTART; } /** * pm_schedule_suspend - Set up a timer to submit a suspend request in future. * @dev: Device to suspend. * @delay: Time to wait before submitting a suspend request, in milliseconds. */ int pm_schedule_suspend(struct device *dev, unsigned int delay) { unsigned long flags; u64 expires; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (!delay) { retval = rpm_suspend(dev, RPM_ASYNC); goto out; } retval = rpm_check_suspend_allowed(dev); if (retval) goto out; /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC; dev->power.timer_expires = expires; dev->power.timer_autosuspends = 0; hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS); out: spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_schedule_suspend); static int rpm_drop_usage_count(struct device *dev) { int ret; ret = atomic_sub_return(1, &dev->power.usage_count); if (ret >= 0) return ret; /* * Because rpm_resume() does not check the usage counter, it will resume * the device even if the usage counter is 0 or negative, so it is * sufficient to increment the usage counter here to reverse the change * made above. */ atomic_inc(&dev->power.usage_count); dev_warn(dev, "Runtime PM usage count underflow!\n"); return -EINVAL; } /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_idle(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_idle(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_idle); /** * __pm_runtime_suspend - Entry point for runtime put/suspend operations. * @dev: Device to suspend. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_suspend(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_suspend(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_suspend); /** * __pm_runtime_resume - Entry point for runtime resume operations. * @dev: Device to resume. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, increment the device's usage count. Then * carry out a resume, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_resume(struct device *dev, int rpmflags) { unsigned long flags; int retval; might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe && dev->power.runtime_status != RPM_ACTIVE); if (rpmflags & RPM_GET_PUT) atomic_inc(&dev->power.usage_count); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_resume(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** * pm_runtime_get_conditional - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * * Return -EINVAL if runtime PM is disabled for @dev. * * Otherwise, if the runtime PM status of @dev is %RPM_ACTIVE and either * @ign_usage_count is %true or the runtime PM usage counter of @dev is not * zero, increment the usage counter of @dev and return 1. Otherwise, return 0 * without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. * * If @ign_usage_count is %false, this function can be used to prevent * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ static int pm_runtime_get_conditional(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.disable_depth > 0) { retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; } else if (ign_usage_count) { retval = 1; atomic_inc(&dev->power.usage_count); } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } /** * pm_runtime_get_if_active - Bump up runtime PM usage counter if the device is * in active state * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE, in which case it returns 1. If the device is in a different * state, 0 is returned. -EINVAL is returned if runtime PM is disabled for the * device, in which case also the usage_count will remain unmodified. */ int pm_runtime_get_if_active(struct device *dev) { return pm_runtime_get_conditional(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0, in which case * it returns 1. If the device is in a different state or its usage_count is 0, * 0 is returned. -EINVAL is returned if runtime PM is disabled for the device, * in which case also the usage_count will remain unmodified. */ int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_conditional(dev, false); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_in_use); /** * __pm_runtime_set_status - Set runtime PM status of a device. * @dev: Device to handle. * @status: New runtime PM status of the device. * * If runtime PM of the device is disabled or its power.runtime_error field is * different from zero, the status may be changed either to RPM_ACTIVE, or to * RPM_SUSPENDED, as long as that reflects the actual state of the device. * However, if the device has a parent and the parent is not active, and the * parent's power.ignore_children flag is unset, the device's status cannot be * set to RPM_ACTIVE, so -EBUSY is returned in that case. * * If successful, __pm_runtime_set_status() clears the power.runtime_error field * and the device parent's counter of unsuspended children is modified to * reflect the new status. If the new status is RPM_SUSPENDED, an idle * notification request for the parent is submitted. * * If @dev has any suppliers (as reflected by device links to them), and @status * is RPM_ACTIVE, they will be activated upfront and if the activation of one * of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead * of the @status value) and the suppliers will be deacticated on exit. The * error returned by the failing supplier activation will be returned in that * case. */ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an * error if it is enabled already and working. */ if (dev->power.runtime_error || dev->power.disable_depth) dev->power.disable_depth++; else error = -EAGAIN; spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; /* * If the new status is RPM_ACTIVE, the suppliers can be activated * upfront regardless of the current status, because next time * rpm_put_suppliers() runs, the rpm_active refcounts of the links * involved will be dropped down to one anyway. */ if (status == RPM_ACTIVE) { int idx = device_links_read_lock(); error = rpm_get_suppliers(dev); if (error) status = RPM_SUSPENDED; device_links_read_unlock(idx); } spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; if (status == RPM_SUSPENDED) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; } else { spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is * not active, has runtime PM enabled and the * 'power.ignore_children' flag unset. */ if (!parent->power.disable_depth && !parent->power.ignore_children && parent->power.runtime_status != RPM_ACTIVE) { dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n", dev_name(dev), dev_name(parent)); error = -EBUSY; } else if (dev->power.runtime_status == RPM_SUSPENDED) { atomic_inc(&parent->power.child_count); } spin_unlock(&parent->power.lock); if (error) { status = RPM_SUSPENDED; goto out; } } out_set: __update_runtime_status(dev, status); if (!error) dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); if (status == RPM_SUSPENDED) { int idx = device_links_read_lock(); rpm_put_suppliers(dev); device_links_read_unlock(idx); } pm_runtime_enable(dev); return error; } EXPORT_SYMBOL_GPL(__pm_runtime_set_status); /** * __pm_runtime_barrier - Cancel pending requests and wait for completions. * @dev: Device to handle. * * Flush all pending requests for the device from pm_wq and wait for all * runtime PM operations involving the device in progress to complete. * * Should be called under dev->power.lock with interrupts disabled. */ static void __pm_runtime_barrier(struct device *dev) { pm_runtime_deactivate_timer(dev); if (dev->power.request_pending) { dev->power.request = RPM_REQ_NONE; spin_unlock_irq(&dev->power.lock); cancel_work_sync(&dev->power.work); spin_lock_irq(&dev->power.lock); dev->power.request_pending = false; } if (dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || dev->power.idle_notification) { DEFINE_WAIT(wait); /* Suspend, wake-up or idle notification in progress. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING && dev->power.runtime_status != RPM_RESUMING && !dev->power.idle_notification) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); } } /** * pm_runtime_barrier - Flush pending requests and wait for completions. * @dev: Device to handle. * * Prevent the device from being suspended by incrementing its usage counter and * if there's a pending resume request for the device, wake the device up. * Next, make sure that all pending requests for the device have been flushed * from pm_wq and wait for all runtime PM operations involving the device in * progress to complete. * * Return value: * 1, if there was a resume request pending and the device had to be woken up, * 0, otherwise */ int pm_runtime_barrier(struct device *dev) { int retval = 0; pm_runtime_get_noresume(dev); spin_lock_irq(&dev->power.lock); if (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { rpm_resume(dev, 0); retval = 1; } __pm_runtime_barrier(dev); spin_unlock_irq(&dev->power.lock); pm_runtime_put_noidle(dev); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_barrier); /** * __pm_runtime_disable - Disable runtime PM of a device. * @dev: Device to handle. * @check_resume: If set, check if there's a resume request for the device. * * Increment power.disable_depth for the device and if it was zero previously, * cancel all pending runtime PM requests for the device and wait for all * operations in progress to complete. The device can be either active or * suspended after its runtime PM has been disabled. * * If @check_resume is set and there's a resume request pending when * __pm_runtime_disable() is called and power.disable_depth is zero, the * function will wake up the device before disabling its runtime PM. */ void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); if (dev->power.disable_depth > 0) { dev->power.disable_depth++; goto out; } /* * Wake up the device if there's a resume request pending, because that * means there probably is some I/O to process and disabling runtime PM * shouldn't prevent the device from processing the I/O. */ if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { /* * Prevent suspends and idle notifications from being carried * out after we have woken up the device. */ pm_runtime_get_noresume(dev); rpm_resume(dev, 0); pm_runtime_put_noidle(dev); } /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); dev->power.last_status = dev->power.runtime_status; } out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_disable); /** * pm_runtime_enable - Enable runtime PM of a device. * @dev: Device to handle. */ void pm_runtime_enable(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); goto out; } if (--dev->power.disable_depth > 0) goto out; dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); if (dev->power.runtime_status == RPM_SUSPENDED && !dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0) dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } /** * devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable. * * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for * you at driver exit time if needed. * * @dev: Device to handle. */ int devm_pm_runtime_enable(struct device *dev) { pm_runtime_enable(dev); return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_enable); /** * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * * Increase the device's usage count and clear its power.runtime_auto flag, * so that it cannot be suspended at run time until pm_runtime_allow() is called * for it. */ void pm_runtime_forbid(struct device *dev) { spin_lock_irq(&dev->power.lock); if (!dev->power.runtime_auto) goto out; dev->power.runtime_auto = false; atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_forbid); /** * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * * Decrease the device's usage count and set its power.runtime_auto flag. */ void pm_runtime_allow(struct device *dev) { int ret; spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; ret = rpm_drop_usage_count(dev); if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_allow); /** * pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device. * @dev: Device to handle. * * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no runtime PM * callbacks of its own. The runtime sysfs attributes will be removed. */ void pm_runtime_no_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_callbacks = 1; spin_unlock_irq(&dev->power.lock); if (device_is_registered(dev)) rpm_sysfs_remove(dev); } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); /** * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. * @dev: Device to handle * * Set the power.irq_safe flag, which tells the PM core that the * ->runtime_suspend() and ->runtime_resume() callbacks for this device should * always be invoked with the spinlock held and interrupts disabled. It also * causes the parent's usage counter to be permanently incremented, preventing * the parent from runtime suspending -- otherwise an irq-safe child might have * to wait for a non-irq-safe parent. */ void pm_runtime_irq_safe(struct device *dev) { if (dev->parent) pm_runtime_get_sync(dev->parent); spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 1; spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. * @old_delay: The former autosuspend_delay value. * @old_use: The former use_autosuspend value. * * Prevent runtime suspend if the new delay is negative and use_autosuspend is * set; otherwise allow it. Send an idle notification if suspends are allowed. * * This function must be called under dev->power.lock with interrupts disabled. */ static void update_autosuspend(struct device *dev, int old_delay, int old_use) { int delay = dev->power.autosuspend_delay; /* Should runtime suspend be prevented now? */ if (dev->power.use_autosuspend && delay < 0) { /* If it used to be allowed then prevent it. */ if (!old_use || old_delay >= 0) { atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { trace_rpm_usage(dev, 0); } } /* Runtime suspend should be allowed now. */ else { /* If it used to be prevented then allow it. */ if (old_use && old_delay < 0) atomic_dec(&dev->power.usage_count); /* Maybe we can autosuspend now. */ rpm_idle(dev, RPM_AUTO); } } /** * pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value. * @dev: Device to handle. * @delay: Value of the new delay in milliseconds. * * Set the device's power.autosuspend_delay value. If it changes to negative * and the power.use_autosuspend flag is set, prevent runtime suspends. If it * changes the other way, allow runtime suspends. */ void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.autosuspend_delay = delay; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay); /** * __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag. * @dev: Device to handle. * @use: New value for use_autosuspend. * * Set the device's power.use_autosuspend flag, and allow or prevent runtime * suspends as needed. */ void __pm_runtime_use_autosuspend(struct device *dev, bool use) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.use_autosuspend = use; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); /** * pm_runtime_init - Initialize runtime PM fields in given device object. * @dev: Device object to initialize. */ void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; atomic_set(&dev->power.usage_count, 0); dev->power.runtime_error = 0; atomic_set(&dev->power.child_count, 0); pm_suspend_ignore_children(dev, false); dev->power.runtime_auto = true; dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; dev->power.needs_force_resume = 0; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); dev->power.suspend_timer.function = pm_suspend_timer_fn; init_waitqueue_head(&dev->power.wait_queue); } /** * pm_runtime_reinit - Re-initialize runtime PM fields in given device object. * @dev: Device object to re-initialize. */ void pm_runtime_reinit(struct device *dev) { if (!pm_runtime_enabled(dev)) { if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); if (dev->power.irq_safe) { spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 0; spin_unlock_irq(&dev->power.lock); if (dev->parent) pm_runtime_put(dev->parent); } } } /** * pm_runtime_remove - Prepare for removing a device from device hierarchy. * @dev: Device object being removed from device hierarchy. */ void pm_runtime_remove(struct device *dev) { __pm_runtime_disable(dev, false); pm_runtime_reinit(dev); } /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ void pm_runtime_get_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->flags & DL_FLAG_PM_RUNTIME) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); } device_links_read_unlock(idx); } /** * pm_runtime_put_suppliers - Drop references to supplier devices. * @dev: Consumer device. */ void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; pm_runtime_put(link->supplier); } device_links_read_unlock(idx); } void pm_runtime_new_link(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.links_count++; spin_unlock_irq(&dev->power.lock); } static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); dev->power.links_count--; spin_unlock_irq(&dev->power.lock); } /** * pm_runtime_drop_link - Prepare for device link removal. * @link: Device link going away. * * Drop the link count of the consumer end of @link and decrement the supplier * device's runtime PM usage counter as many times as needed to drop all of the * PM runtime reference to it from the consumer. */ void pm_runtime_drop_link(struct device_link *link) { if (!(link->flags & DL_FLAG_PM_RUNTIME)) return; pm_runtime_drop_link_count(link->consumer); pm_runtime_release_supplier(link); pm_request_idle(link->supplier); } static bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || dev->power.ignore_children); } /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. * * Disable runtime PM so we safely can check the device's runtime PM status and * if it is active, invoke its ->runtime_suspend callback to suspend it and * change its runtime PM status field to RPM_SUSPENDED. Also, if the device's * usage and children counters don't indicate that the device was in use before * the system-wide transition under way, decrement its parent's children counter * (if there is a parent). Keep runtime PM disabled to preserve the state * unless we encounter errors. * * Typically this function may be invoked from a system suspend callback to make * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. * * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent * state where this function has called the ->runtime_suspend callback but the * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { int (*callback)(struct device *); int ret; pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) return 0; callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? callback(dev) : 0; if (ret) goto err; dev_pm_enable_wake_irq_complete(dev); /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of * its parent, but set its status to RPM_SUSPENDED anyway in case this * function will be called again for it in the meantime. */ if (pm_runtime_need_not_resume(dev)) { pm_runtime_set_suspended(dev); } else { __update_runtime_status(dev, RPM_SUSPENDED); dev->power.needs_force_resume = 1; } return 0; err: dev_pm_disable_wake_irq_check(dev, true); pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * Prior invoking this function we expect the user to have brought the device * into low power state by a call to pm_runtime_force_suspend(). Here we reverse * those actions and bring the device into full power, if it is expected to be * used on system resume. In the other case, we defer the resume to be managed * via runtime PM. * * Typically this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) goto out; /* * The value of the parent's children counter is correct already, so * just update the status of the device. */ __update_runtime_status(dev, RPM_ACTIVE); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; if (ret) { pm_runtime_set_suspended(dev); dev_pm_enable_wake_irq_check(dev, false); goto out; } pm_runtime_mark_last_busy(dev); out: dev->power.needs_force_resume = 0; pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 #ifndef _UAPI_LINUX_VIRTIO_RING_H #define _UAPI_LINUX_VIRTIO_RING_H /* An interface for efficient virtio implementation, currently for use by KVM, * but hopefully others soon. Do NOT change this since it will * break existing servers and clients. * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of IBM nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright Rusty Russell IBM Corporation 2007. */ #ifndef __KERNEL__ #include <stdint.h> #endif #include <linux/types.h> #include <linux/virtio_types.h> /* This marks a buffer as continuing via the next field. */ #define VRING_DESC_F_NEXT 1 /* This marks a buffer as write-only (otherwise read-only). */ #define VRING_DESC_F_WRITE 2 /* This means the buffer contains a list of buffer descriptors. */ #define VRING_DESC_F_INDIRECT 4 /* * Mark a descriptor as available or used in packed ring. * Notice: they are defined as shifts instead of shifted values. */ #define VRING_PACKED_DESC_F_AVAIL 7 #define VRING_PACKED_DESC_F_USED 15 /* The Host uses this in used->flags to advise the Guest: don't kick me when * you add a buffer. It's unreliable, so it's simply an optimization. Guest * will still kick if it's out of buffers. */ #define VRING_USED_F_NO_NOTIFY 1 /* The Guest uses this in avail->flags to advise the Host: don't interrupt me * when you consume a buffer. It's unreliable, so it's simply an * optimization. */ #define VRING_AVAIL_F_NO_INTERRUPT 1 /* Enable events in packed ring. */ #define VRING_PACKED_EVENT_FLAG_ENABLE 0x0 /* Disable events in packed ring. */ #define VRING_PACKED_EVENT_FLAG_DISABLE 0x1 /* * Enable events for a specific descriptor in packed ring. * (as specified by Descriptor Ring Change Event Offset/Wrap Counter). * Only valid if VIRTIO_RING_F_EVENT_IDX has been negotiated. */ #define VRING_PACKED_EVENT_FLAG_DESC 0x2 /* * Wrap counter bit shift in event suppression structure * of packed ring. */ #define VRING_PACKED_EVENT_F_WRAP_CTR 15 /* We support indirect buffer descriptors */ #define VIRTIO_RING_F_INDIRECT_DESC 28 /* The Guest publishes the used index for which it expects an interrupt * at the end of the avail ring. Host should ignore the avail->flags field. */ /* The Host publishes the avail index for which it expects a kick * at the end of the used ring. Guest should ignore the used->flags field. */ #define VIRTIO_RING_F_EVENT_IDX 29 /* Alignment requirements for vring elements. * When using pre-virtio 1.0 layout, these fall out naturally. */ #define VRING_AVAIL_ALIGN_SIZE 2 #define VRING_USED_ALIGN_SIZE 4 #define VRING_DESC_ALIGN_SIZE 16 /** * struct vring_desc - Virtio ring descriptors, * 16 bytes long. These can chain together via @next. * * @addr: buffer address (guest-physical) * @len: buffer length * @flags: descriptor flags * @next: index of the next descriptor in the chain, * if the VRING_DESC_F_NEXT flag is set. We chain unused * descriptors via this, too. */ struct vring_desc { __virtio64 addr; __virtio32 len; __virtio16 flags; __virtio16 next; }; struct vring_avail { __virtio16 flags; __virtio16 idx; __virtio16 ring[]; }; /* u32 is used here for ids for padding reasons. */ struct vring_used_elem { /* Index of start of used descriptor chain. */ __virtio32 id; /* Total length of the descriptor chain which was used (written to) */ __virtio32 len; }; typedef struct vring_used_elem __attribute__((aligned(VRING_USED_ALIGN_SIZE))) vring_used_elem_t; struct vring_used { __virtio16 flags; __virtio16 idx; vring_used_elem_t ring[]; }; /* * The ring element addresses are passed between components with different * alignments assumptions. Thus, we might need to decrease the compiler-selected * alignment, and so must use a typedef to make sure the aligned attribute * actually takes hold: * * https://gcc.gnu.org/onlinedocs//gcc/Common-Type-Attributes.html#Common-Type-Attributes * * When used on a struct, or struct member, the aligned attribute can only * increase the alignment; in order to decrease it, the packed attribute must * be specified as well. When used as part of a typedef, the aligned attribute * can both increase and decrease alignment, and specifying the packed * attribute generates a warning. */ typedef struct vring_desc __attribute__((aligned(VRING_DESC_ALIGN_SIZE))) vring_desc_t; typedef struct vring_avail __attribute__((aligned(VRING_AVAIL_ALIGN_SIZE))) vring_avail_t; typedef struct vring_used __attribute__((aligned(VRING_USED_ALIGN_SIZE))) vring_used_t; struct vring { unsigned int num; vring_desc_t *desc; vring_avail_t *avail; vring_used_t *used; }; #ifndef VIRTIO_RING_NO_LEGACY /* The standard layout for the ring is a continuous chunk of memory which looks * like this. We assume num is a power of 2. * * struct vring * { * // The actual descriptors (16 bytes each) * struct vring_desc desc[num]; * * // A ring of available descriptor heads with free-running index. * __virtio16 avail_flags; * __virtio16 avail_idx; * __virtio16 available[num]; * __virtio16 used_event_idx; * * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. * __virtio16 used_flags; * __virtio16 used_idx; * struct vring_used_elem used[num]; * __virtio16 avail_event_idx; * }; */ /* We publish the used event index at the end of the available ring, and vice * versa. They are at the end for backwards compatibility. */ #define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) #define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num]) static inline void vring_init(struct vring *vr, unsigned int num, void *p, unsigned long align) { vr->num = num; vr->desc = p; vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } static inline unsigned vring_size(unsigned int num, unsigned long align) { return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num) + align - 1) & ~(align - 1)) + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num; } #endif /* VIRTIO_RING_NO_LEGACY */ /* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ /* Assuming a given event_idx value from the other side, if * we have just incremented index from old to new_idx, * should we trigger an event? */ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old) { /* Note: Xen has similar logic for notification hold-off * in include/xen/interface/io/ring.h with req_event and req_prod * corresponding to event_idx + 1 and new_idx respectively. * Note also that req_event and req_prod in Xen start at 1, * event indexes in virtio start at 0. */ return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old); } struct vring_packed_desc_event { /* Descriptor Ring Change Event Offset/Wrap Counter. */ __le16 off_wrap; /* Descriptor Ring Change Event Flags. */ __le16 flags; }; struct vring_packed_desc { /* Buffer Address. */ __le64 addr; /* Buffer Length. */ __le32 len; /* Buffer ID. */ __le16 id; /* The flags depending on descriptor type. */ __le16 flags; }; #endif /* _UAPI_LINUX_VIRTIO_RING_H */
115 84 138 4 773 7 5 2 2 6 18 18 313 37 50 11 34 224 29 3 83 15 83 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_MM_H #define _LINUX_SCHED_MM_H #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/gfp.h> #include <linux/sync_core.h> #include <linux/sched/coredump.h> /* * Routines for handling mm_structs */ extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. * @mm: The &struct mm_struct to pin. * * Make sure that @mm will not get freed even after the owning task * exits. This doesn't guarantee that the associated address space * will still exist later on and mmget_not_zero() has to be used before * accessing it. * * This is a preferred way to pin @mm for a longer/unbounded amount * of time. * * Use mmdrop() to release the reference acquired by mmgrab(). * * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) { atomic_inc(&mm->mm_count); } static inline void smp_mb__after_mmgrab(void) { smp_mb__after_atomic(); } extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) { /* * The implicit full barrier implied by atomic_dec_and_test() is * required by the membarrier system call before returning to * user-space, after storing to rq->curr. */ if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } #ifdef CONFIG_PREEMPT_RT /* * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is * by far the least expensive way to do that. */ static inline void __mmdrop_delayed(struct rcu_head *rhp) { struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); __mmdrop(mm); } /* * Invoked from finish_task_switch(). Delegates the heavy lifting on RT * kernels via RCU. */ static inline void mmdrop_sched(struct mm_struct *mm) { /* Provides a full memory barrier. See mmdrop() */ if (atomic_dec_and_test(&mm->mm_count)) call_rcu(&mm->delayed_drop, __mmdrop_delayed); } #else static inline void mmdrop_sched(struct mm_struct *mm) { mmdrop(mm); } #endif /* Helpers for lazy TLB mm refcounting */ static inline void mmgrab_lazy_tlb(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) mmgrab(mm); } static inline void mmdrop_lazy_tlb(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) { mmdrop(mm); } else { /* * mmdrop_lazy_tlb must provide a full memory barrier, see the * membarrier comment finish_task_switch which relies on this. */ smp_mb(); } } static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm) { if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) mmdrop_sched(mm); else smp_mb(); /* see mmdrop_lazy_tlb() above */ } /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. * * Make sure that the address space of the given &struct mm_struct doesn't * go away. This does not protect against parts of the address space being * modified or freed, however. * * Never use this function to pin this address space for an * unbounded/indefinite amount of time. * * Use mmput() to release the reference acquired by mmget(). * * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) { atomic_inc(&mm->mm_users); } static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); } /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); #ifdef CONFIG_MMU /* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ void mmput_async(struct mm_struct *); #endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* * Grab a reference to a task's mm, if it is not already going away * and ptrace_may_access with the mode parameter passed to it * succeeds. */ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct on exit() */ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *); #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU #ifndef arch_get_mmap_end #define arch_get_mmap_end(addr, len, flags) (TASK_SIZE) #endif #ifndef arch_get_mmap_base #define arch_get_mmap_base(addr, base) (base) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) { bool ret; /* * need RCU to access ->real_parent if CLONE_VM was used along with * CLONE_PARENT. * * We check real_parent->mm == tsk->mm because CLONE_VFORK does not * imply CLONE_VM * * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus * ->real_parent is not necessarily the task doing vfork(), so in * theory we can't rely on task_lock() if we want to dereference it. * * And in this case we can't trust the real_parent->mm == tsk->mm * check, it can be false negative. But we do not care, if init or * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); ret = tsk->vfork_done && rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; } /* * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence */ if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } return flags; } #ifdef CONFIG_LOCKDEP extern void __fs_reclaim_acquire(unsigned long ip); extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else static inline void __fs_reclaim_acquire(unsigned long ip) { } static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif /* Any memory-allocation retry loop should use * memalloc_retry_wait(), and pass the flags for the most * constrained allocation attempt that might have failed. * This provides useful documentation of where loops are, * and a central place to fine tune the waiting as the MM * implementation changes. */ static inline void memalloc_retry_wait(gfp_t gfp_flags) { /* We use io_schedule_timeout because waiting for memory * typically included waiting for dirty pages to be * written out, which requires IO. */ __set_current_state(TASK_UNINTERRUPTIBLE); gfp_flags = current_gfp_context(gfp_flags); if (gfpflags_allow_blocking(gfp_flags) && !(gfp_flags & __GFP_NORETRY)) /* Probably waited already, no need for much more */ io_schedule_timeout(1); else /* Probably didn't wait, and has now released a lock, * so now is a good time to wait */ io_schedule_timeout(HZ/50); } /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate * * Similar to might_sleep() and other annotations, this can be used in functions * that might allocate, but often don't. Compiles to nothing without * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. */ static inline void might_alloc(gfp_t gfp_mask) { fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } /** * memalloc_flags_save - Add a PF_* flag to current->flags, save old value * * This allows PF_* flags to be conveniently added, irrespective of current * value, and then the old version restored with memalloc_flags_restore(). */ static inline unsigned memalloc_flags_save(unsigned flags) { unsigned oldflags = ~current->flags & flags; current->flags |= flags; return oldflags; } static inline void memalloc_flags_restore(unsigned flags) { current->flags &= ~flags; } /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * * This functions marks the beginning of the GFP_NOIO allocation scope. * All further allocations will implicitly drop __GFP_IO flag and so * they are safe for the IO critical section from the allocation recursion * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * * Context: This function is safe to be used from any context. * Return: The saved flags to be passed to memalloc_noio_restore. */ static inline unsigned int memalloc_noio_save(void) { return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_noio_save call. */ static inline void memalloc_noio_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. * * This functions marks the beginning of the GFP_NOFS allocation scope. * All further allocations will implicitly drop __GFP_FS flag and so * they are safe for the FS critical section from the allocation recursion * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * * Context: This function is safe to be used from any context. * Return: The saved flags to be passed to memalloc_nofs_restore. */ static inline unsigned int memalloc_nofs_save(void) { return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_nofs_save call. */ static inline void memalloc_nofs_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope. * * This function marks the beginning of the __GFP_MEMALLOC allocation scope. * All further allocations will implicitly add the __GFP_MEMALLOC flag, which * prevents entering reclaim and allows access to all memory reserves. This * should only be used when the caller guarantees the allocation will allow more * memory to be freed very shortly, i.e. it needs to allocate some memory in * the process of freeing memory, and cannot reclaim due to potential recursion. * * Users of this scope have to be extremely careful to not deplete the reserves * completely and implement a throttling mechanism which controls the * consumption of the reserve based on the amount of freed memory. Usage of a * pre-allocated pool (e.g. mempool) should be always considered before using * this scope. * * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC * * Context: This function should not be used in an interrupt context as that one * does not give PF_MEMALLOC access to reserves. * See __gfp_pfmemalloc_flags(). * Return: The saved flags to be passed to memalloc_noreclaim_restore. */ static inline unsigned int memalloc_noreclaim_save(void) { return memalloc_flags_save(PF_MEMALLOC); } /** * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope. * @flags: Flags to restore. * * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save * function. Always make sure that the given flags is the return value from the * pairing memalloc_noreclaim_save call. */ static inline void memalloc_noreclaim_restore(unsigned int flags) { memalloc_flags_restore(flags); } /** * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope. * * This function marks the beginning of the ~__GFP_MOVABLE allocation scope. * All further allocations will implicitly remove the __GFP_MOVABLE flag, which * will constraint the allocations to zones that allow long term pinning, i.e. * not ZONE_MOVABLE zones. * * Return: The saved flags to be passed to memalloc_pin_restore. */ static inline unsigned int memalloc_pin_save(void) { return memalloc_flags_save(PF_MEMALLOC_PIN); } /** * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope. * @flags: Flags to restore. * * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function. * Always make sure that the given flags is the return value from the pairing * memalloc_pin_save call. */ static inline void memalloc_pin_restore(unsigned int flags) { memalloc_flags_restore(flags); } #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); /** * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * * Please, make sure that caller has a reference to the passed memcg structure, * so its lifetime is guaranteed to exceed the scope between two * set_active_memcg() calls. * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { old = current->active_memcg; current->active_memcg = memcg; } return old; } #else static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { return NULL; } #endif #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7), }; enum { MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), MEMBARRIER_FLAG_RSEQ = (1U << 1), }; #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS #include <asm/membarrier.h> #endif static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; sync_core_before_usermode(); } extern void membarrier_exec_mmap(struct mm_struct *mm); extern void membarrier_update_current_mm(struct mm_struct *next_mm); #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { } #endif static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } static inline void membarrier_update_current_mm(struct mm_struct *next_mm) { } #endif #endif /* _LINUX_SCHED_MM_H */
71 37 70 3 30 6 116 113 7 111 113 113 112 112 113 113 1 112 27 86 57 34 26 9 112 53 75 90 26 113 26 26 26 23 26 25 26 26 26 26 26 25 26 1 26 26 26 13 26 32 32 32 32 32 32 32 32 29 29 29 22 29 29 32 3 23 23 2 3 3 2 2 2 1 1 3 2 2 1 2 2 7 7 1 6 1 7 1 1 1 2 2 1 1 1 3 2 4 2 1 1 1 4 4 3 2 1 3 3 14 11 13 14 12 11 5 9 7 3 9 4 3 7 2 7 2 2 2 192 3 27 6 27 27 27 3 191 53 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 // SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), * which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. */ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. */ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. */ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. */ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). * This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. */ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. */ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) goto freeskb; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) return -ENOBUFS; return 0; } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). */ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. */ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_lock(); err = arp_req_delete(net, &r); rtnl_unlock(); break; case SIOCSARP: rtnl_lock(); err = arp_req_set(net, &r); rtnl_unlock(); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. */ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 // SPDX-License-Identifier: GPL-2.0-only /* * Generic show_mem() implementation * * Copyright (C) 2008 Johannes Weiner <hannes@saeurebad.de> */ #include <linux/blkdev.h> #include <linux/cma.h> #include <linux/cpuset.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/swap.h> #include <linux/vmstat.h> #include "internal.h" #include "swap.h" atomic_long_t _totalram_pages __read_mostly; EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; static inline void show_node(struct zone *zone) { if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } long si_mem_available(void) { long available; unsigned long pagecache; unsigned long wmark_low = 0; unsigned long reclaimable; struct zone *zone; for_each_zone(zone) wmark_low += low_wmark_pages(zone); /* * Estimate the amount of memory available for userspace allocations, * without causing swapping or OOM. */ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will * start swapping or thrashing. Assume at least half of the page * cache, or the low watermark worth of cache, needs to stay. */ pagecache = global_node_page_state(NR_ACTIVE_FILE) + global_node_page_state(NR_INACTIVE_FILE); pagecache -= min(pagecache / 2, wmark_low); available += pagecache; /* * Part of the reclaimable slab and other kernel memory consists of * items that are in use, and cannot be freed. Cap this estimate at the * low watermark. */ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); reclaimable -= min(reclaimable / 2, wmark_low); available += reclaimable; if (available < 0) available = 0; return available; } EXPORT_SYMBOL_GPL(si_mem_available); void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages(); val->sharedram = global_node_page_state(NR_SHMEM); val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages(); val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; unsigned long managed_highpages = 0; unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); val->totalram = managed_pages; val->sharedram = node_page_state(pgdat, NR_SHMEM); val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } val->totalhigh = managed_highpages; val->freehigh = free_highpages; #else val->totalhigh = managed_highpages; val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } #endif /* * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { if (!(flags & SHOW_MEM_FILTER_NODES)) return false; /* * no node mask - aka implicit memory numa policy. Do not bother with * the synchronization - read_mems_allowed_begin - because we do not * have to be precise here. */ if (!nodemask) nodemask = &cpuset_current_mems_allowed; return !node_isset(nid, *nodemask); } static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_MOVABLE] = 'M', [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_HIGHATOMIC] = 'H', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif #ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = 'I', #endif }; char tmp[MIGRATE_TYPES + 1]; char *p = tmp; int i; for (i = 0; i < MIGRATE_TYPES; i++) { if (type & (1 << i)) *p++ = types[i]; } *p = '\0'; printk(KERN_CONT "(%s) ", tmp); } static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) { int zone_idx; for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) if (zone_managed_pages(pgdat->node_zones + zone_idx)) return true; return false; } /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. * * Bits in @filter: * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; struct zone *zone; pg_data_t *pgdat; for_each_populated_zone(zone) { if (zone_idx(zone) > max_zone_idx) continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu\n" " sec_pagetables:%lu bounce:%lu\n" " kernel_misc_reclaimable:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_node_page_state(NR_ACTIVE_ANON), global_node_page_state(NR_INACTIVE_ANON), global_node_page_state(NR_ISOLATED_ANON), global_node_page_state(NR_ACTIVE_FILE), global_node_page_state(NR_INACTIVE_FILE), global_node_page_state(NR_ISOLATED_FILE), global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_node_page_state(NR_PAGETABLE), global_node_page_state(NR_SECONDARY_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), global_zone_page_state(NR_FREE_PAGES), free_pcp, global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; if (!node_has_managed_zones(pgdat, max_zone_idx)) continue; printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " isolated(anon):%lukB" " isolated(file):%lukB" " mapped:%lukB" " dirty:%lukB" " writeback:%lukB" " shmem:%lukB" #ifdef CONFIG_TRANSPARENT_HUGEPAGE " shmem_thp:%lukB" " shmem_pmdmapped:%lukB" " anon_thp:%lukB" #endif " writeback_tmp:%lukB" " kernel_stack:%lukB" #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif " pagetables:%lukB" " sec_pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, K(node_page_state(pgdat, NR_ACTIVE_ANON)), K(node_page_state(pgdat, NR_INACTIVE_ANON)), K(node_page_state(pgdat, NR_ACTIVE_FILE)), K(node_page_state(pgdat, NR_INACTIVE_FILE)), K(node_page_state(pgdat, NR_UNEVICTABLE)), K(node_page_state(pgdat, NR_ISOLATED_ANON)), K(node_page_state(pgdat, NR_ISOLATED_FILE)), K(node_page_state(pgdat, NR_FILE_MAPPED)), K(node_page_state(pgdat, NR_FILE_DIRTY)), K(node_page_state(pgdat, NR_WRITEBACK)), K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS)), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)); } for_each_populated_zone(zone) { int i; if (zone_idx(zone) > max_zone_idx) continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; show_node(zone); printk(KERN_CONT "%s" " free:%lukB" " boost:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" " reserved_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " writepending:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(zone->watermark_boost), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone->nr_reserved_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); printk(KERN_CONT "\n"); } for_each_populated_zone(zone) { unsigned int order; unsigned long nr[NR_PAGE_ORDERS], flags, total = 0; unsigned char types[NR_PAGE_ORDERS]; if (zone_idx(zone) > max_zone_idx) continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &zone->free_area[order]; int type; nr[order] = area->nr_free; total += nr[order] << order; types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { if (!free_area_empty(area, type)) types[order] |= 1 << type; } } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < NR_PAGE_ORDERS; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) show_migration_types(types[order]); } printk(KERN_CONT "= %lukB\n", K(total)); } for_each_online_node(nid) { if (show_mem_node_skip(filter, nid, nodemask)) continue; hugetlb_show_meminfo_node(nid); } printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; printk("Mem-Info:\n"); show_free_areas(filter, nodemask, max_zone_idx); for_each_populated_zone(zone) { total += zone->present_pages; reserved += zone->present_pages - zone_managed_pages(zone); if (is_highmem(zone)) highmem += zone->present_pages; } printk("%lu pages RAM\n", total); printk("%lu pages HighMem/MovableOnly\n", highmem); printk("%lu pages reserved\n", reserved); #ifdef CONFIG_CMA printk("%lu pages cma reserved\n", totalcma_pages); #endif #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif #ifdef CONFIG_MEM_ALLOC_PROFILING { struct codetag_bytes tags[10]; size_t i, nr; nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false); if (nr) { pr_notice("Memory allocations:\n"); for (i = 0; i < nr; i++) { struct codetag *ct = tags[i].ct; struct alloc_tag *tag = ct_to_alloc_tag(ct); struct alloc_tag_counters counter = alloc_tag_read(tag); char bytes[10]; string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes)); /* Same as alloc_tag_to_text() but w/o intermediate buffer */ if (ct->modname) pr_notice("%12s %8llu %s:%u [%s] func:%s\n", bytes, counter.calls, ct->filename, ct->lineno, ct->modname, ct->function); else pr_notice("%12s %8llu %s:%u func:%s\n", bytes, counter.calls, ct->filename, ct->lineno, ct->function); } } } #endif }
1106 1014 1601 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H #include <linux/bitops.h> #include <linux/wordpart.h> struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; *bits = mask; return mask; } static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) { return bits; } #ifdef CONFIG_64BIT /* Keep the initial has_zero() value for both bitmask and size calc */ #define create_zero_mask(bits) (bits) static inline unsigned long zero_bytemask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } #define find_zero(bits) (__ffs(bits) >> 3) #else /* Create the final mask for both bytemask and size */ static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) /* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline unsigned long find_zero(unsigned long mask) { /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ long a = (0x0ff0001+mask) >> 23; /* Fix the 1 for 00 case */ return a & mask; } #endif /* * Load an unaligned word from kernel space. * * In the (very unlikely) case of the word being a page-crosser * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. */ static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long ret; asm volatile( "1: mov %[mem], %[ret]\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD) : [ret] "=r" (ret) : [mem] "m" (*(unsigned long *)addr)); return ret; } #endif /* _ASM_WORD_AT_A_TIME_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 /* * Compressed rom filesystem for Linux. * * Copyright (C) 1999 Linus Torvalds. * * This file is released under the GPL. */ /* * These are the VFS interfaces to the compressed rom filesystem. * The actual compression is based on zlib, see the other files. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/pfn_t.h> #include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> #include <linux/mtd/mtd.h> #include <linux/mtd/super.h> #include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> #include <uapi/linux/cramfs_fs.h> #include <linux/uaccess.h> #include "internal.h" /* * cramfs super-block data in memory */ struct cramfs_sb_info { unsigned long magic; unsigned long size; unsigned long blocks; unsigned long files; unsigned long flags; void *linear_virt_addr; resource_size_t linear_phys_addr; size_t mtd_point_size; }; static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) { return sb->s_fs_info; } static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; static const struct file_operations cramfs_physmem_fops; static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); /* These macros may change in future, to provide better st_ino semantics. */ #define OFFSET(x) ((x)->i_ino) static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) { if (!cino->offset) return offset + 1; if (!cino->size) return offset + 1; /* * The file mode test fixes buggy mkcramfs implementations where * cramfs_inode->offset is set to a non zero value for entries * which did not contain data, like devices node and fifos. */ switch (cino->mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: return cino->offset << 2; default: break; } return offset + 1; } static struct inode *get_cramfs_inode(struct super_block *sb, const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; static struct timespec64 zerotime; inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; switch (cramfs_inode->mode & S_IFMT) { case S_IFREG: inode->i_fop = &generic_ro_fops; inode->i_data.a_ops = &cramfs_aops; if (IS_ENABLED(CONFIG_CRAMFS_MTD) && CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS && CRAMFS_SB(sb)->linear_phys_addr) inode->i_fop = &cramfs_physmem_fops; break; case S_IFDIR: inode->i_op = &cramfs_dir_inode_operations; inode->i_fop = &cramfs_directory_operations; break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; default: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); } inode->i_mode = cramfs_inode->mode; i_uid_write(inode, cramfs_inode->uid); i_gid_write(inode, cramfs_inode->gid); /* if the lower 2 bits are zero, the inode contains data */ if (!(inode->i_ino & 3)) { inode->i_size = cramfs_inode->size; inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; } /* Struct copy intentional */ inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime))); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even without -noleaf option. */ unlock_new_inode(inode); return inode; } /* * We have our own block cache: don't fill up the buffer cache * with the rom-image, because the way the filesystem is set * up the accesses should be fairly regular and cached in the * page cache and dentry tree anyway.. * * This also acts as a way to guarantee contiguous areas of up to * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. * * Note: This is all optimized away at compile time when * CONFIG_CRAMFS_BLOCKDEV=n. */ #define READ_BUFFERS (2) /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ #define NEXT_BUFFER(_ix) ((_ix) ^ 1) /* * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed" * data that takes up more space than the original and with unlucky * alignment. */ #define BLKS_PER_BUF_SHIFT (2) #define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) #define BUFFER_SIZE (BLKS_PER_BUF*PAGE_SIZE) static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* * Populate our block cache and return a pointer to it. */ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_mapping; struct file_ra_state ra = {}; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; char *data; if (!len) return NULL; blocknr = offset >> PAGE_SHIFT; offset &= PAGE_SIZE - 1; /* Check if an existing buffer already has the data.. */ for (i = 0; i < READ_BUFFERS; i++) { unsigned int blk_offset; if (buffer_dev[i] != sb) continue; if (blocknr < buffer_blocknr[i]) continue; blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT; blk_offset += offset; if (blk_offset > BUFFER_SIZE || blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ file_ra_state_init(&ra, mapping); page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF); for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = NULL; if (blocknr + i < devsize) { page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? */ if (IS_ERR(page)) page = NULL; } pages[i] = page; } buffer = next_buffer; next_buffer = NEXT_BUFFER(buffer); buffer_blocknr[buffer] = blocknr; buffer_dev[buffer] = sb; data = read_buffers[buffer]; for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; if (page) { memcpy_from_page(data, page, 0, PAGE_SIZE); put_page(page); } else memset(data, 0, PAGE_SIZE); data += PAGE_SIZE; } return read_buffers[buffer] + offset; } /* * Return a pointer to the linearly addressed cramfs image in memory. */ static void *cramfs_direct_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); if (!len) return NULL; if (len > sbi->size || offset > sbi->size - len) return page_address(ZERO_PAGE(0)); return sbi->linear_virt_addr + offset; } /* * Returns a pointer to a buffer containing at least LEN bytes of * filesystem starting at byte offset OFFSET into the filesystem. */ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr) return cramfs_direct_read(sb, offset, len); else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) return cramfs_blkdev_read(sb, offset, len); else return NULL; } /* * For a mapping to be possible, we need a range of uncompressed and * contiguous blocks. Return the offset for the first block and number of * valid blocks for which that is true, or zero otherwise. */ static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages) { struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); int i; u32 *blockptrs, first_block_addr; /* * We can dereference memory directly here as this code may be * reached only when there is a direct filesystem image mapping * available in memory. */ blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4); first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS; i = 0; do { u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT); u32 expect = (first_block_addr + block_off) | CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED; if (blockptrs[i] != expect) { pr_debug("range: block %d/%d got %#x expects %#x\n", pgoff+i, pgoff + *pages - 1, blockptrs[i], expect); if (i == 0) return 0; break; } } while (++i < *pages); *pages = i; return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT; } #ifdef CONFIG_MMU /* * Return true if the last page of a file in the filesystem image contains * some other data that doesn't belong to that file. It is assumed that the * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED * (verified by cramfs_get_block_range() and directly accessible in memory. */ static bool cramfs_last_page_is_shared(struct inode *inode) { struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); u32 partial, last_page, blockaddr, *blockptrs; char *tail_data; partial = offset_in_page(inode->i_size); if (!partial) return false; last_page = inode->i_size >> PAGE_SHIFT; blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode)); blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS; blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; tail_data = sbi->linear_virt_addr + blockaddr + partial; return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false; } static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); unsigned int pages, max_pages, offset; unsigned long address, pgoff = vma->vm_pgoff; char *bailout_reason; int ret; ret = generic_file_readonly_mmap(file, vma); if (ret) return ret; /* * Now try to pre-populate ptes for this vma with a direct * mapping avoiding memory allocation when possible. */ /* Could COW work here? */ bailout_reason = "vma is writable"; if (vma->vm_flags & VM_WRITE) goto bailout; max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bailout_reason = "beyond file limit"; if (pgoff >= max_pages) goto bailout; pages = min(vma_pages(vma), max_pages - pgoff); offset = cramfs_get_block_range(inode, pgoff, &pages); bailout_reason = "unsuitable block layout"; if (!offset) goto bailout; address = sbi->linear_phys_addr + offset; bailout_reason = "data is not page aligned"; if (!PAGE_ALIGNED(address)) goto bailout; /* Don't map the last page if it contains some other data */ if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { pr_debug("mmap: %pD: last page is shared\n", file); pages--; } if (!pages) { bailout_reason = "no suitable block remaining"; goto bailout; } if (pages == vma_pages(vma)) { /* * The entire vma is mappable. remap_pfn_range() will * make it distinguishable from a non-direct mapping * in /proc/<pid>/maps by substituting the file offset * with the actual physical address. */ ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, pages * PAGE_SIZE, vma->vm_page_prot); } else { /* * Let's create a mixed map if we can't map it all. * The normal paging machinery will take care of the * unpopulated ptes via cramfs_read_folio(). */ int i; vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } } if (!ret) pr_debug("mapped %pD[%lu] at 0x%08lx (%u/%lu pages) " "to vma 0x%08lx, page_prot 0x%llx\n", file, pgoff, address, pages, vma_pages(vma), vma->vm_start, (unsigned long long)pgprot_val(vma->vm_page_prot)); return ret; bailout: pr_debug("%pD[%lu]: direct mmap impossible: %s\n", file, pgoff, bailout_reason); /* Didn't manage any direct map, but normal paging is still possible */ return 0; } #else /* CONFIG_MMU */ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned int pages, block_pages, max_pages, offset; pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= max_pages || pages > max_pages - pgoff) return -EINVAL; block_pages = pages; offset = cramfs_get_block_range(inode, pgoff, &block_pages); if (!offset || block_pages != pages) return -ENOSYS; addr = sbi->linear_phys_addr + offset; pr_debug("get_unmapped for %pD ofs %#lx siz %lu at 0x%08lx\n", file, pgoff*PAGE_SIZE, len, addr); return addr; } static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) { return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_EXEC; } #endif /* CONFIG_MMU */ static const struct file_operations cramfs_physmem_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .splice_read = filemap_splice_read, .mmap = cramfs_physmem_mmap, #ifndef CONFIG_MMU .get_unmapped_area = cramfs_physmem_get_unmapped_area, .mmap_capabilities = cramfs_physmem_mmap_capabilities, #endif }; static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); generic_shutdown_super(sb); if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); put_mtd_device(sb->s_mtd); sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); bdev_fput(sb->s_bdev_file); } kfree(sbi); } static int cramfs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; return 0; } static int cramfs_read_super(struct super_block *sb, struct fs_context *fc, struct cramfs_super *super) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; bool silent = fc->sb_flags & SB_SILENT; /* We don't know the real size yet */ sbi->size = PAGE_SIZE; /* Read the first block and get the superblock from it */ mutex_lock(&read_mutex); memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); /* Do sanity checks on the superblock */ if (super->magic != CRAMFS_MAGIC) { /* check for wrong endianness */ if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) errorfc(fc, "wrong endianness"); return -EINVAL; } /* check at 512 byte offset */ mutex_lock(&read_mutex); memcpy(super, cramfs_read(sb, 512, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); if (super->magic != CRAMFS_MAGIC) { if (super->magic == CRAMFS_MAGIC_WEND && !silent) errorfc(fc, "wrong endianness"); else if (!silent) errorfc(fc, "wrong magic"); return -EINVAL; } } /* get feature flags first */ if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { errorfc(fc, "unsupported filesystem features"); return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super->root.mode)) { errorfc(fc, "root is not a directory"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super->root.mode |= 0555; root_offset = super->root.offset << 2; if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) { sbi->size = super->size; sbi->blocks = super->fsid.blocks; sbi->files = super->fsid.files; } else { sbi->size = 1<<28; sbi->blocks = 0; sbi->files = 0; } sbi->magic = super->magic; sbi->flags = super->flags; if (root_offset == 0) infofc(fc, "empty filesystem"); else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { errorfc(fc, "bad root offset %lu", root_offset); return -EINVAL; } return 0; } static int cramfs_finalize_super(struct super_block *sb, struct cramfs_inode *cramfs_root) { struct inode *root; /* Set it all up.. */ sb->s_flags |= SB_RDONLY; sb->s_time_min = 0; sb->s_time_max = 0; sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; return 0; } static int cramfs_blkdev_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; int i, err; sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; /* Invalidate the read buffers on mount: think disk change.. */ for (i = 0; i < READ_BUFFERS; i++) buffer_blocknr[i] = -1; err = cramfs_read_super(sb, fc, &super); if (err) return err; return cramfs_finalize_super(sb, &super.root); } static int cramfs_mtd_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; int err; sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; /* Map only one page for now. Will remap it when fs size is known. */ err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size, &sbi->linear_virt_addr, &sbi->linear_phys_addr); if (err || sbi->mtd_point_size != PAGE_SIZE) { pr_err("unable to get direct memory access to mtd:%s\n", sb->s_mtd->name); return err ? : -ENODATA; } pr_info("checking physical address %pap for linear cramfs image\n", &sbi->linear_phys_addr); err = cramfs_read_super(sb, fc, &super); if (err) return err; /* Remap the whole filesystem now */ pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n", sb->s_mtd->name, sbi->size/1024); mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE); err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size, &sbi->linear_virt_addr, &sbi->linear_phys_addr); if (err || sbi->mtd_point_size != sbi->size) { pr_err("unable to get direct memory access to mtd:%s\n", sb->s_mtd->name); return err ? : -ENODATA; } return cramfs_finalize_super(sb, &super.root); } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = 0; if (sb->s_bdev) id = huge_encode_dev(sb->s_bdev->bd_dev); else if (sb->s_dev) id = huge_encode_dev(sb->s_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_blocks = CRAMFS_SB(sb)->blocks; buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } /* * Read a cramfs directory entry. */ static int cramfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; /* Offset within the thing. */ if (ctx->pos >= inode->i_size) return 0; offset = ctx->pos; /* Directory entries are always 4-byte aligned */ if (offset & 3) return -EINVAL; buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL); if (!buf) return -ENOMEM; while (offset < inode->i_size) { struct cramfs_inode *de; unsigned long nextoffset; char *name; ino_t ino; umode_t mode; int namelen; mutex_lock(&read_mutex); de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* * Namelengths on disk are shifted by two * and the name padded out to 4-byte boundaries * with zeroes. */ namelen = de->namelen << 2; memcpy(buf, name, namelen); ino = cramino(de, OFFSET(inode) + offset); mode = de->mode; mutex_unlock(&read_mutex); nextoffset = offset + sizeof(*de) + namelen; for (;;) { if (!namelen) { kfree(buf); return -EIO; } if (buf[namelen-1]) break; namelen--; } if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) break; ctx->pos = offset = nextoffset; } kfree(buf); return 0; } /* * Lookup and fill in the inode data.. */ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; int sorted; mutex_lock(&read_mutex); sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS; while (offset < dir->i_size) { struct cramfs_inode *de; char *name; int namelen, retval; int dir_off = OFFSET(dir) + offset; de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* Try to take advantage of sorted directories */ if (sorted && (dentry->d_name.name[0] < name[0])) break; namelen = de->namelen << 2; offset += sizeof(*de) + namelen; /* Quick check that the name is roughly the right length */ if (((dentry->d_name.len + 3) & ~3) != namelen) continue; for (;;) { if (!namelen) { inode = ERR_PTR(-EIO); goto out; } if (name[namelen-1]) break; namelen--; } if (namelen != dentry->d_name.len) continue; retval = memcmp(dentry->d_name.name, name, namelen); if (retval > 0) continue; if (!retval) { inode = get_cramfs_inode(dir->i_sb, de, dir_off); break; } /* else (retval < 0) */ if (sorted) break; } out: mutex_unlock(&read_mutex); return d_splice_alias(inode, dentry); } static int cramfs_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; u32 maxblock; int bytes_filled; void *pgdata; bool success = false; maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bytes_filled = 0; pgdata = kmap_local_folio(folio, 0); if (folio->index < maxblock) { struct super_block *sb = inode->i_sb; u32 blkptr_offset = OFFSET(inode) + folio->index * 4; u32 block_ptr, block_start, block_len; bool uncompressed, direct; mutex_lock(&read_mutex); block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4); uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED); direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR); block_ptr &= ~CRAMFS_BLK_FLAGS; if (direct) { /* * The block pointer is an absolute start pointer, * shifted by 2 bits. The size is included in the * first 2 bytes of the data block when compressed, * or PAGE_SIZE otherwise. */ block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT; if (uncompressed) { block_len = PAGE_SIZE; /* if last block: cap to file length */ if (folio->index == maxblock - 1) block_len = offset_in_page(inode->i_size); } else { block_len = *(u16 *) cramfs_read(sb, block_start, 2); block_start += 2; } } else { /* * The block pointer indicates one past the end of * the current block (start of next block). If this * is the first block then it starts where the block * pointer table ends, otherwise its start comes * from the previous block's pointer. */ block_start = OFFSET(inode) + maxblock * 4; if (folio->index) block_start = *(u32 *) cramfs_read(sb, blkptr_offset - 4, 4); /* Beware... previous ptr might be a direct ptr */ if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { /* See comments on earlier code. */ u32 prev_start = block_start; block_start = prev_start & ~CRAMFS_BLK_FLAGS; block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { block_start += PAGE_SIZE; } else { block_len = *(u16 *) cramfs_read(sb, block_start, 2); block_start += 2 + block_len; } } block_start &= ~CRAMFS_BLK_FLAGS; block_len = block_ptr - block_start; } if (block_len == 0) ; /* hole */ else if (unlikely(block_len > 2*PAGE_SIZE || (uncompressed && block_len > PAGE_SIZE))) { mutex_unlock(&read_mutex); pr_err("bad data blocksize %u\n", block_len); goto err; } else if (uncompressed) { memcpy(pgdata, cramfs_read(sb, block_start, block_len), block_len); bytes_filled = block_len; } else { bytes_filled = cramfs_uncompress_block(pgdata, PAGE_SIZE, cramfs_read(sb, block_start, block_len), block_len); } mutex_unlock(&read_mutex); if (unlikely(bytes_filled < 0)) goto err; } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); flush_dcache_folio(folio); success = true; err: kunmap_local(pgdata); folio_end_read(folio, success); return 0; } static const struct address_space_operations cramfs_aops = { .read_folio = cramfs_read_folio }; /* * Our operations: */ /* * A directory can only readdir */ static const struct file_operations cramfs_directory_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = cramfs_readdir, }; static const struct inode_operations cramfs_dir_inode_operations = { .lookup = cramfs_lookup, }; static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; static int cramfs_get_tree(struct fs_context *fc) { int ret = -ENOPROTOOPT; if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { ret = get_tree_mtd(fc, cramfs_mtd_fill_super); if (!ret) return 0; } if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) ret = get_tree_bdev(fc, cramfs_blkdev_fill_super); return ret; } static const struct fs_context_operations cramfs_context_ops = { .get_tree = cramfs_get_tree, .reconfigure = cramfs_reconfigure, }; /* * Set up the filesystem mount context. */ static int cramfs_init_fs_context(struct fs_context *fc) { fc->ops = &cramfs_context_ops; return 0; } static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .init_fs_context = cramfs_init_fs_context, .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); static int __init init_cramfs_fs(void) { int rv; rv = cramfs_uncompress_init(); if (rv < 0) return rv; rv = register_filesystem(&cramfs_fs_type); if (rv < 0) cramfs_uncompress_exit(); return rv; } static void __exit exit_cramfs_fs(void) { cramfs_uncompress_exit(); unregister_filesystem(&cramfs_fs_type); } module_init(init_cramfs_fs) module_exit(exit_cramfs_fs) MODULE_DESCRIPTION("Compressed ROM file system support"); MODULE_LICENSE("GPL");
2 91 104 860 7 988 6 1037 1037 4580 1 186 1181 4571 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 /* SPDX-License-Identifier: GPL-2.0 */ /* thread_info.h: common low-level thread information accessors * * Copyright (C) 2002 David Howells (dhowells@redhat.com) * - Incorporating suggestions made by Linus Torvalds */ #ifndef _LINUX_THREAD_INFO_H #define _LINUX_THREAD_INFO_H #include <linux/types.h> #include <linux/limits.h> #include <linux/bug.h> #include <linux/restart_block.h> #include <linux/errno.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, * including <asm/current.h> can cause a circular dependency on some platforms. */ #include <asm/current.h> #define current_thread_info() ((struct thread_info *)current) #endif #include <linux/bitops.h> /* * For per-arch arch_within_stack_frames() implementations, defined in * asm/thread_info.h. */ enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK, }; #ifdef CONFIG_GENERIC_ENTRY enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) #endif #include <asm/thread_info.h> #ifndef TIF_NEED_RESCHED_LAZY #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY #error Inconsistent PREEMPT_LAZY #endif #define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif #ifdef __KERNEL__ #ifndef arch_set_restart_data #define arch_set_restart_data(restart) do { } while (0) #endif static inline long set_restart_fn(struct restart_block *restart, long (*fn)(struct restart_block *)) { restart->fn = fn; arch_set_restart_data(restart); return -ERESTART_RESTARTBLOCK; } #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag, (unsigned long *)&ti->flags); } static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) { clear_bit(flag, (unsigned long *)&ti->flags); } static inline void update_ti_thread_flag(struct thread_info *ti, int flag, bool value) { if (value) set_ti_thread_flag(ti, flag); else clear_ti_thread_flag(ti, flag); } static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_set_bit(flag, (unsigned long *)&ti->flags); } static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_clear_bit(flag, (unsigned long *)&ti->flags); } static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag, (unsigned long *)&ti->flags); } /* * This may be used in noinstr code, and needs to be __always_inline to prevent * inadvertent instrumentation. */ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti) { return READ_ONCE(ti->flags); } #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ clear_ti_thread_flag(current_thread_info(), flag) #define update_thread_flag(flag, value) \ update_ti_thread_flag(current_thread_info(), flag, value) #define test_and_set_thread_flag(flag) \ test_and_set_ti_thread_flag(current_thread_info(), flag) #define test_and_clear_thread_flag(flag) \ test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) #define read_thread_flags() \ read_ti_thread_flags(current_thread_info()) #define read_task_thread_flags(t) \ read_ti_thread_flags(task_thread_info(t)) #ifdef CONFIG_GENERIC_ENTRY #define set_syscall_work(fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define test_syscall_work(fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define clear_syscall_work(fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work) #define set_task_syscall_work(t, fl) \ set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define test_task_syscall_work(t, fl) \ test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #define clear_task_syscall_work(t, fl) \ clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) #else /* CONFIG_GENERIC_ENTRY */ #define set_syscall_work(fl) \ set_ti_thread_flag(current_thread_info(), TIF_##fl) #define test_syscall_work(fl) \ test_ti_thread_flag(current_thread_info(), TIF_##fl) #define clear_syscall_work(fl) \ clear_ti_thread_flag(current_thread_info(), TIF_##fl) #define set_task_syscall_work(t, fl) \ set_ti_thread_flag(task_thread_info(t), TIF_##fl) #define test_task_syscall_work(t, fl) \ test_ti_thread_flag(task_thread_info(t), TIF_##fl) #define clear_task_syscall_work(t, fl) \ clear_ti_thread_flag(task_thread_info(t), TIF_##fl) #endif /* !CONFIG_GENERIC_ENTRY */ #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H static __always_inline bool tif_test_bit(int bit) { return arch_test_bit(bit, (unsigned long *)(&current_thread_info()->flags)); } #else static __always_inline bool tif_test_bit(int bit) { return test_bit(bit, (unsigned long *)(&current_thread_info()->flags)); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ static __always_inline bool tif_need_resched(void) { return tif_test_bit(TIF_NEED_RESCHED); } #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) { return 0; } #endif #ifdef CONFIG_HARDENED_USERCOPY extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); void __copy_overflow(int size, unsigned long count); static inline void copy_overflow(int size, unsigned long count) { if (IS_ENABLED(CONFIG_BUG)) __copy_overflow(size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif void arch_task_cache_init(void); /* for CONFIG_SH */ void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */
365 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_COMPLETION_H #define __LINUX_COMPLETION_H /* * (C) Copyright 2001 Linus Torvalds * * Atomic wait-for-completion handler data structures. * See kernel/sched/completion.c for details. */ #include <linux/swait.h> /* * struct completion - structure used to maintain state for a "completion" * * This is the opaque structure used to maintain the state for a "completion". * Completions currently use a FIFO to queue threads that have to wait for * the "completion" event. * * See also: complete(), wait_for_completion() (and friends _timeout, * _interruptible, _interruptible_timeout, and _killable), init_completion(), * reinit_completion(), and macros DECLARE_COMPLETION(), * DECLARE_COMPLETION_ONSTACK(). */ struct completion { unsigned int done; struct swait_queue_head wait; }; #define init_completion_map(x, m) init_completion(x) static inline void complete_acquire(struct completion *x) {} static inline void complete_release(struct completion *x) {} #define COMPLETION_INITIALIZER(work) \ { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) } #define COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) \ (*({ init_completion_map(&(work), &(map)); &(work); })) #define COMPLETION_INITIALIZER_ONSTACK(work) \ (*({ init_completion(&work); &work; })) /** * DECLARE_COMPLETION - declare and initialize a completion structure * @work: identifier for the completion structure * * This macro declares and initializes a completion structure. Generally used * for static declarations. You should use the _ONSTACK variant for automatic * variables. */ #define DECLARE_COMPLETION(work) \ struct completion work = COMPLETION_INITIALIZER(work) /* * Lockdep needs to run a non-constant initializer for on-stack * completions - so we use the _ONSTACK() variant for those that * are on the kernel stack: */ /** * DECLARE_COMPLETION_ONSTACK - declare and initialize a completion structure * @work: identifier for the completion structure * * This macro declares and initializes a completion structure on the kernel * stack. */ #ifdef CONFIG_LOCKDEP # define DECLARE_COMPLETION_ONSTACK(work) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK(work) # define DECLARE_COMPLETION_ONSTACK_MAP(work, map) \ struct completion work = COMPLETION_INITIALIZER_ONSTACK_MAP(work, map) #else # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work) # define DECLARE_COMPLETION_ONSTACK_MAP(work, map) DECLARE_COMPLETION(work) #endif /** * init_completion - Initialize a dynamically allocated completion * @x: pointer to completion structure that is to be initialized * * This inline function will initialize a dynamically created completion * structure. */ static inline void init_completion(struct completion *x) { x->done = 0; init_swait_queue_head(&x->wait); } /** * reinit_completion - reinitialize a completion structure * @x: pointer to completion structure that is to be reinitialized * * This inline function should be used to reinitialize a completion structure so it can * be reused. This is especially important after complete_all() is used. */ static inline void reinit_completion(struct completion *x) { x->done = 0; } extern void wait_for_completion(struct completion *); extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); extern int wait_for_completion_killable(struct completion *x); extern int wait_for_completion_state(struct completion *x, unsigned int state); extern unsigned long wait_for_completion_timeout(struct completion *x, unsigned long timeout); extern unsigned long wait_for_completion_io_timeout(struct completion *x, unsigned long timeout); extern long wait_for_completion_interruptible_timeout( struct completion *x, unsigned long timeout); extern long wait_for_completion_killable_timeout( struct completion *x, unsigned long timeout); extern bool try_wait_for_completion(struct completion *x); extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_on_current_cpu(struct completion *x); extern void complete_all(struct completion *); #endif
191 192 5 192 39 8 194 24 4 2 2 4 4 4 4 4 193 7 6 1 5 3 2 1 6 7 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION);
295 215 2286 19 18 2650 92 341 50 19 9 75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_H #define _LINUX_PID_H #include <linux/pid_types.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/wait.h> /* * What is struct pid? * * A struct pid is the kernel's internal notion of a process identifier. * It refers to individual tasks, process groups, and sessions. While * there are processes attached to it the struct pid lives in a hash * table, so it and then the processes that it refers to can be found * quickly from the numeric pid value. The attached processes may be * quickly accessed by following pointers from struct pid. * * Storing pid_t values in the kernel and referring to them later has a * problem. The process originally with that pid may have exited and the * pid allocator wrapped, and another process could have come along * and been assigned that pid. * * Referring to user space processes by holding a reference to struct * task_struct has a problem. When the user space process exits * the now useless task_struct is still kept. A task_struct plus a * stack consumes around 10K of low kernel memory. More precisely * this is THREAD_SIZE + sizeof(struct task_struct). By comparison * a struct pid is about 64 bytes. * * Holding a reference to struct pid solves both of these problems. * It is small so holding a reference does not consume a lot of * resources, and since a new struct pid is allocated when the numeric pid * value is reused (when pids wrap around) we don't mistakenly refer to new * processes. */ /* * struct upid is used to get the id of the struct pid, as it is * seen in particular namespace. Later the struct pid is found with * find_pid_ns() using the int nr and struct pid_namespace *ns. */ #define RESERVED_PIDS 300 struct upid { int nr; struct pid_namespace *ns; }; struct pid { refcount_t count; unsigned int level; spinlock_t lock; struct dentry *stashed; u64 ino; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; /* wait queue for pidfd notifications */ wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[]; }; extern struct pid init_struct_pid; struct file; struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) { if (pid) refcount_inc(&pid->count); return pid; } extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); static inline bool pid_has_task(struct pid *pid, enum pid_type type) { return !hlist_empty(&pid->tasks[type]); } extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); /* * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); extern void detach_pid(struct task_struct *task, enum pid_type); extern void change_pid(struct task_struct *task, enum pid_type, struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); extern int pid_max; extern int pid_max_min, pid_max_max; /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. * * find_pid_ns() finds the pid in the namespace specified * find_vpid() finds the pid by its virtual id, i.e. in the current namespace * * see also find_task_by_vpid() set in include/linux/sched.h */ extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns); extern struct pid *find_vpid(int nr); /* * Lookup a PID in the hash table, and return with it's count elevated. */ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); /* * ns_of_pid() returns the pid namespace in which the specified pid was * allocated. * * NOTE: * ns_of_pid() is expected to be called for a process (task) that has * an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid * is expected to be non-NULL. If @pid is NULL, caller should handle * the resulting NULL pid-ns. */ static inline struct pid_namespace *ns_of_pid(struct pid *pid) { struct pid_namespace *ns = NULL; if (pid) ns = pid->numbers[pid->level].ns; return ns; } /* * is_child_reaper returns true if the pid is the init process * of the current namespace. As this one could be checked before * pid_ns->child_reaper is assigned in copy_process, we check * with the pid number. */ static inline bool is_child_reaper(struct pid *pid) { return pid->numbers[pid->level].nr == 1; } /* * the helpers to get the pid's id seen from different namespaces * * pid_nr() : global id, i.e. the id seen from the init namespace; * pid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * pid_nr_ns() : id seen from the ns specified. * * see also task_xid_nr() etc in include/linux/sched.h */ static inline pid_t pid_nr(struct pid *pid) { pid_t nr = 0; if (pid) nr = pid->numbers[0].nr; return nr; } pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to * the same pid in the middle of de_thread(). */ #define while_each_pid_task(pid, type, task) \ if (type == PIDTYPE_PID) \ break; \ } \ } while (0) #define do_each_pid_thread(pid, type, task) \ do_each_pid_task(pid, type, task) { \ struct task_struct *tg___ = task; \ for_each_thread(tg___, task) { #define while_each_pid_thread(pid, type, task) \ } \ task = tg___; \ } while_each_pid_task(pid, type, task) static inline struct pid *task_pid(struct task_struct *task) { return task->thread_pid; } /* * the helpers to get the task's different pids as they are seen * from various namespaces * * task_xid_nr() : global id, i.e. the id seen from the init namespace; * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * task_xid_nr_ns() : id seen from the ns specified; * * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); } static inline pid_t task_pid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } static inline pid_t task_tgid_nr(struct task_struct *tsk) { return tsk->tgid; } /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. * * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. * * Return: 1 if the process is alive. 0 otherwise. */ static inline int pid_alive(const struct task_struct *p) { return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); } static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); } static inline pid_t task_session_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) { pid_t pid = 0; rcu_read_lock(); if (pid_alive(tsk)) pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); rcu_read_unlock(); return pid; } static inline pid_t task_ppid_nr(const struct task_struct *tsk) { return task_ppid_nr_ns(tsk, &init_pid_ns); } /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { return task_pgrp_nr_ns(tsk, &init_pid_ns); } /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. * * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { return task_tgid_nr(tsk) == 1; } #endif /* _LINUX_PID_H */
62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 // SPDX-License-Identifier: GPL-2.0-only /* * DCCP connection tracking protocol helper * * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/dccp.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> /* Timeouts are based on values from RFC4340: * * - REQUEST: * * 8.1.2. Client Request * * A client MAY give up on its DCCP-Requests after some time * (3 minutes, for example). * * - RESPOND: * * 8.1.3. Server Response * * It MAY also leave the RESPOND state for CLOSED after a timeout of * not less than 4MSL (8 minutes); * * - PARTOPEN: * * 8.1.5. Handshake Completion * * If the client remains in PARTOPEN for more than 4MSL (8 minutes), * it SHOULD reset the connection with Reset Code 2, "Aborted". * * - OPEN: * * The DCCP timestamp overflows after 11.9 hours. If the connection * stays idle this long the sequence number won't be recognized * as valid anymore. * * - CLOSEREQ/CLOSING: * * 8.3. Termination * * The retransmission timer should initially be set to go off in two * round-trip times and should back off to not less than once every * 64 seconds ... * * - TIMEWAIT: * * 4.3. States * * A server or client socket remains in this state for 2MSL (4 minutes) * after the connection has been town down, ... */ #define DCCP_MSL (2 * 60 * HZ) #ifdef CONFIG_NF_CONNTRACK_PROCFS static const char * const dccp_state_names[] = { [CT_DCCP_NONE] = "NONE", [CT_DCCP_REQUEST] = "REQUEST", [CT_DCCP_RESPOND] = "RESPOND", [CT_DCCP_PARTOPEN] = "PARTOPEN", [CT_DCCP_OPEN] = "OPEN", [CT_DCCP_CLOSEREQ] = "CLOSEREQ", [CT_DCCP_CLOSING] = "CLOSING", [CT_DCCP_TIMEWAIT] = "TIMEWAIT", [CT_DCCP_IGNORE] = "IGNORE", [CT_DCCP_INVALID] = "INVALID", }; #endif #define sNO CT_DCCP_NONE #define sRQ CT_DCCP_REQUEST #define sRS CT_DCCP_RESPOND #define sPO CT_DCCP_PARTOPEN #define sOP CT_DCCP_OPEN #define sCR CT_DCCP_CLOSEREQ #define sCG CT_DCCP_CLOSING #define sTW CT_DCCP_TIMEWAIT #define sIG CT_DCCP_IGNORE #define sIV CT_DCCP_INVALID /* * DCCP state transition table * * The assumption is the same as for TCP tracking: * * We are the man in the middle. All the packets go through us but might * get lost in transit to the destination. It is assumed that the destination * can't receive segments we haven't seen. * * The following states exist: * * NONE: Initial state, expecting Request * REQUEST: Request seen, waiting for Response from server * RESPOND: Response from server seen, waiting for Ack from client * PARTOPEN: Ack after Response seen, waiting for packet other than Response, * Reset or Sync from server * OPEN: Packet other than Response, Reset or Sync seen * CLOSEREQ: CloseReq from server seen, expecting Close from client * CLOSING: Close seen, expecting Reset * TIMEWAIT: Reset seen * IGNORE: Not determinable whether packet is valid * * Some states exist only on one side of the connection: REQUEST, RESPOND, * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to * the one it was in before. * * Packets are marked as ignored (sIG) if we don't know if they're valid * (for example a reincarnation of a connection we didn't notice is dead * already) and the server may send back a connection closing Reset or a * Response. They're also used for Sync/SyncAck packets, which we don't * care about. */ static const u_int8_t dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { [CT_DCCP_ROLE_CLIENT] = { [DCCP_PKT_REQUEST] = { /* * sNO -> sRQ Regular Request * sRQ -> sRQ Retransmitted Request or reincarnation * sRS -> sRS Retransmitted Request (apparently Response * got lost after we saw it) or reincarnation * sPO -> sIG Ignore, conntrack might be out of sync * sOP -> sIG Ignore, conntrack might be out of sync * sCR -> sIG Ignore, conntrack might be out of sync * sCG -> sIG Ignore, conntrack might be out of sync * sTW -> sRQ Reincarnation * * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */ sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, }, [DCCP_PKT_RESPONSE] = { /* * sNO -> sIV Invalid * sRQ -> sIG Ignore, might be response to ignored Request * sRS -> sIG Ignore, might be response to ignored Request * sPO -> sIG Ignore, might be response to ignored Request * sOP -> sIG Ignore, might be response to ignored Request * sCR -> sIG Ignore, might be response to ignored Request * sCG -> sIG Ignore, might be response to ignored Request * sTW -> sIV Invalid, reincarnation in reverse direction * goes through sRQ * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, }, [DCCP_PKT_ACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN * sOP -> sOP Regular ACK, remain in OPEN * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV }, [DCCP_PKT_DATA] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) * sOP -> sOP Regular Data packet * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) * sCG -> sCG Data in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, }, [DCCP_PKT_DATAACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) * sPO -> sPO Remain in PARTOPEN state * sOP -> sOP Regular DataAck packet in OPEN state * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV }, [DCCP_PKT_CLOSEREQ] = { /* * CLOSEREQ may only be sent by the server. * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, [DCCP_PKT_CLOSE] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sCG Client-initiated close * sOP -> sCG Client-initiated close * sCR -> sCG Close in response to CloseReq (8.3.) * sCG -> sCG Retransmit * sTW -> sIV Late retransmit, already in TIME_WAIT * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV }, [DCCP_PKT_RESET] = { /* * sNO -> sIV No connection * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) * sRS -> sTW Response received without Request * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) * sOP -> sTW Connection reset * sCR -> sTW Connection reset * sCG -> sTW Connection reset * sTW -> sIG Ignore (don't refresh timer) * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG }, [DCCP_PKT_SYNC] = { /* * We currently ignore Sync packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, [DCCP_PKT_SYNCACK] = { /* * We currently ignore SyncAck packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, }, [CT_DCCP_ROLE_SERVER] = { [DCCP_PKT_REQUEST] = { /* * sNO -> sIV Invalid * sRQ -> sIG Ignore, conntrack might be out of sync * sRS -> sIG Ignore, conntrack might be out of sync * sPO -> sIG Ignore, conntrack might be out of sync * sOP -> sIG Ignore, conntrack might be out of sync * sCR -> sIG Ignore, conntrack might be out of sync * sCG -> sIG Ignore, conntrack might be out of sync * sTW -> sRQ Reincarnation, must reverse roles * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ }, [DCCP_PKT_RESPONSE] = { /* * sNO -> sIV Response without Request * sRQ -> sRS Response to clients Request * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) * sPO -> sIG Response to an ignored Request or late retransmit * sOP -> sIG Ignore, might be response to ignored Request * sCR -> sIG Ignore, might be response to ignored Request * sCG -> sIG Ignore, might be response to ignored Request * sTW -> sIV Invalid, Request from client in sTW moves to sRQ * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV }, [DCCP_PKT_ACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP Enter OPEN state (8.1.5.) * sOP -> sOP Regular Ack in OPEN state * sCR -> sIV Waiting for Close from client * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV }, [DCCP_PKT_DATA] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP Enter OPEN state (8.1.5.) * sOP -> sOP Regular Data packet in OPEN state * sCR -> sIV Waiting for Close from client * sCG -> sCG Data in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV }, [DCCP_PKT_DATAACK] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP Enter OPEN state (8.1.5.) * sOP -> sOP Regular DataAck in OPEN state * sCR -> sIV Waiting for Close from client * sCG -> sCG Data in CLOSING MAY be processed (8.3.) * sTW -> sIV * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV }, [DCCP_PKT_CLOSEREQ] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) * sOP -> sCR CloseReq in OPEN state * sCR -> sCR Retransmit * sCG -> sCR Simultaneous close, client sends another Close * sTW -> sIV Already closed * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV }, [DCCP_PKT_CLOSE] = { /* * sNO -> sIV No connection * sRQ -> sIV No connection * sRS -> sIV No connection * sPO -> sOP -> sCG Move direcly to CLOSING * sOP -> sCG Move to CLOSING * sCR -> sIV Close after CloseReq is invalid * sCG -> sCG Retransmit * sTW -> sIV Already closed * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV }, [DCCP_PKT_RESET] = { /* * sNO -> sIV No connection * sRQ -> sTW Reset in response to Request * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) * sOP -> sTW * sCR -> sTW * sCG -> sTW * sTW -> sIG Ignore (don't refresh timer) * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG }, [DCCP_PKT_SYNC] = { /* * We currently ignore Sync packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, [DCCP_PKT_SYNCACK] = { /* * We currently ignore SyncAck packets * * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, }, }, }; static noinline bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, const struct dccp_hdr *dh, const struct nf_hook_state *hook_state) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; const char *msg; u_int8_t state; state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: dn = nf_dccp_pernet(net); if (dn->dccp_loose == 0) { msg = "not picking up existing connection "; goto out_invalid; } break; case CT_DCCP_REQUEST: break; case CT_DCCP_INVALID: msg = "invalid state transition "; goto out_invalid; } ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.state = CT_DCCP_NONE; ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; ct->proto.dccp.handshake_seq = 0; return true; out_invalid: nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); return false; } static u64 dccp_ack_seq(const struct dccp_hdr *dh) { const struct dccp_hdr_ack_bits *dhack; dhack = (void *)dh + __dccp_basic_hdr_len(dh); return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); } static bool dccp_error(const struct dccp_hdr *dh, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { static const unsigned long require_seq48 = 1 << DCCP_PKT_REQUEST | 1 << DCCP_PKT_RESPONSE | 1 << DCCP_PKT_CLOSEREQ | 1 << DCCP_PKT_CLOSE | 1 << DCCP_PKT_RESET | 1 << DCCP_PKT_SYNC | 1 << DCCP_PKT_SYNCACK; unsigned int dccp_len = skb->len - dataoff; unsigned int cscov; const char *msg; u8 type; BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG); if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || dh->dccph_doff * 4 > dccp_len) { msg = "nf_ct_dccp: truncated/malformed packet "; goto out_invalid; } cscov = dccp_len; if (dh->dccph_cscov) { cscov = (dh->dccph_cscov - 1) * 4; if (cscov > dccp_len) { msg = "nf_ct_dccp: bad checksum coverage "; goto out_invalid; } } if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_DCCP, state->pf)) { msg = "nf_ct_dccp: bad checksum "; goto out_invalid; } type = dh->dccph_type; if (type >= DCCP_PKT_INVALID) { msg = "nf_ct_dccp: reserved packet type "; goto out_invalid; } if (test_bit(type, &require_seq48) && !dh->dccph_x) { msg = "nf_ct_dccp: type lacks 48bit sequence numbers"; goto out_invalid; } return false; out_invalid: nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); return true; } struct nf_conntrack_dccp_buf { struct dccp_hdr dh; /* generic header part */ struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */ union { /* depends on header type */ struct dccp_hdr_ack_bits ack; struct dccp_hdr_request req; struct dccp_hdr_response response; struct dccp_hdr_reset rst; } u; }; static struct dccp_hdr * dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh, struct nf_conntrack_dccp_buf *buf) { unsigned int hdrlen = __dccp_hdr_len(dh); if (hdrlen > sizeof(*buf)) return NULL; return skb_header_pointer(skb, offset, hdrlen, buf); } int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conntrack_dccp_buf _dh; u_int8_t type, old_state, new_state; enum ct_dccp_roles role; unsigned int *timeouts; struct dccp_hdr *dh; dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) return -NF_ACCEPT; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; /* pull again, including possible 48 bit sequences and subtype header */ dh = dccp_header_pointer(skb, dataoff, dh, &_dh); if (!dh) return -NF_ACCEPT; type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) return -NF_ACCEPT; if (type == DCCP_PKT_RESET && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* Tear down connection immediately if only reply is a RESET */ nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } spin_lock_bh(&ct->lock); role = ct->proto.dccp.role[dir]; old_state = ct->proto.dccp.state; new_state = dccp_state_table[role][type][old_state]; switch (new_state) { case CT_DCCP_REQUEST: if (old_state == CT_DCCP_TIMEWAIT && role == CT_DCCP_ROLE_SERVER) { /* Reincarnation in the reverse direction: reopen and * reverse client/server roles. */ ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; } break; case CT_DCCP_RESPOND: if (old_state == CT_DCCP_REQUEST) ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); break; case CT_DCCP_PARTOPEN: if (old_state == CT_DCCP_RESPOND && type == DCCP_PKT_ACK && dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) set_bit(IPS_ASSURED_BIT, &ct->status); break; case CT_DCCP_IGNORE: /* * Connection tracking might be out of sync, so we ignore * packets that might establish a new connection and resync * if the server responds with a valid Response. */ if (ct->proto.dccp.last_dir == !dir && ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && type == DCCP_PKT_RESPONSE) { ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); new_state = CT_DCCP_RESPOND; break; } ct->proto.dccp.last_dir = dir; ct->proto.dccp.last_pkt = type; spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); return NF_ACCEPT; case CT_DCCP_INVALID: spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); return -NF_ACCEPT; } ct->proto.dccp.last_dir = dir; ct->proto.dccp.last_pkt = type; ct->proto.dccp.state = new_state; spin_unlock_bh(&ct->lock); if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); return NF_ACCEPT; } static bool dccp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.dccp.state) { case CT_DCCP_CLOSEREQ: case CT_DCCP_CLOSING: case CT_DCCP_TIMEWAIT: return true; default: break; } return false; } #ifdef CONFIG_NF_CONNTRACK_PROCFS static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); } #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, cpu_to_be64(ct->proto.dccp.handshake_seq), CTA_PROTOINFO_DCCP_PAD)) goto nla_put_failure; skip_state: nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, }; #define DCCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ NLA_ALIGN(NLA_HDRLEN + 0)) static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; int err; if (!attr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, dccp_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_PROTOINFO_DCCP_STATE] || !tb[CTA_PROTOINFO_DCCP_ROLE] || nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { return -EINVAL; } spin_lock_bh(&ct->lock); ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; } else { ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; } if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { ct->proto.dccp.handshake_seq = be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); } spin_unlock_bh(&ct->lock); return 0; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_dccp_net *dn = nf_dccp_pernet(net); unsigned int *timeouts = data; int i; if (!timeouts) timeouts = dn->dccp_timeout; /* set default DCCP timeouts. */ for (i=0; i<CT_DCCP_MAX; i++) timeouts[i] = dn->dccp_timeout[i]; /* there's a 1:1 mapping between attributes and protocol states. */ for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { if (tb[i]) { timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; } } timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST]; return 0; } static int dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; int i; for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) goto nla_put_failure; } return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_dccp_init_net(struct net *net) { struct nf_dccp_net *dn = nf_dccp_pernet(net); /* default values */ dn->dccp_loose = 1; dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; /* timeouts[0] is unused, make it same as SYN_SENT so * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { .l4proto = IPPROTO_DCCP, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = DCCP_NLATTR_SIZE, .to_nlattr = dccp_to_nlattr, .from_nlattr = nlattr_to_dccp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = dccp_timeout_nlattr_to_obj, .obj_to_nlattr = dccp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_DCCP_MAX, .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, .nla_policy = dccp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ };
43 42 43 43 43 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H #include <linux/context_tracking_state.h> #include <linux/kprobes.h> #include <asm/debugreg.h> #include <asm/idtentry.h> #include <asm/siginfo.h> /* TRAP_TRACE, ... */ #include <asm/trap_pf.h> #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif extern int ibt_selftest(void); extern int ibt_selftest_noendbr(void); #ifdef CONFIG_X86_F00F_BUG /* For handling the FOOF bug */ void handle_invalid_op(struct pt_regs *regs); #endif static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP) return TRAP_TRACE; else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) return TRAP_HWBKPT; else return TRAP_BRKPT; } extern int panic_on_unrecovered_nmi; void math_emulate(struct math_emu_info *); bool fault_in_kernel_space(unsigned long address); #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(struct pt_regs *regs, unsigned long fault_address, struct stack_info *info); #endif static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); } #endif /* _ASM_X86_TRAPS_H */
16 16 16 16 16 16 16 16 4 4 17 17 17 18 17 17 18 4 18 18 16 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dccp/input.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include "ackvec.h" #include "ccid.h" #include "dccp.h" /* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) { __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); sk->sk_data_ready(sk); } static void dccp_fin(struct sock *sk, struct sk_buff *skb) { /* * On receiving Close/CloseReq, both RD/WR shutdown are performed. * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after * receiving the closing segment, but there is no guarantee that such * data will be processed at all. */ sk->sk_shutdown = SHUTDOWN_MASK; sock_set_flag(sk, SOCK_DONE); dccp_enqueue_skb(sk, skb); } static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) { int queued = 0; switch (sk->sk_state) { /* * We ignore Close when received in one of the following states: * - CLOSED (may be a late or duplicate packet) * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) * - RESPOND (already handled by dccp_check_req) */ case DCCP_CLOSING: /* * Simultaneous-close: receiving a Close after sending one. This * can happen if both client and server perform active-close and * will result in an endless ping-pong of crossing and retrans- * mitted Close packets, which only terminates when one of the * nodes times out (min. 64 seconds). Quicker convergence can be * achieved when one of the nodes acts as tie-breaker. * This is ok as both ends are done with data transfer and each * end is just waiting for the other to acknowledge termination. */ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) break; fallthrough; case DCCP_REQUESTING: case DCCP_ACTIVE_CLOSEREQ: dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_done(sk); break; case DCCP_OPEN: case DCCP_PARTOPEN: /* Give waiting application a chance to read pending data */ queued = 1; dccp_fin(sk, skb); dccp_set_state(sk, DCCP_PASSIVE_CLOSE); fallthrough; case DCCP_PASSIVE_CLOSE: /* * Retransmitted Close: we have already enqueued the first one. */ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); } return queued; } static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) { int queued = 0; /* * Step 7: Check for unexpected packet types * If (S.is_server and P.type == CloseReq) * Send Sync packet acknowledging P.seqno * Drop packet and return */ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); return queued; } /* Step 13: process relevant Client states < CLOSEREQ */ switch (sk->sk_state) { case DCCP_REQUESTING: dccp_send_close(sk, 0); dccp_set_state(sk, DCCP_CLOSING); break; case DCCP_OPEN: case DCCP_PARTOPEN: /* Give waiting application a chance to read pending data */ queued = 1; dccp_fin(sk, skb); dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); fallthrough; case DCCP_PASSIVE_CLOSEREQ: sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); } return queued; } static u16 dccp_reset_code_convert(const u8 code) { static const u16 error_code[] = { [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ [DCCP_RESET_CODE_ABORTED] = ECONNRESET, [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, }; return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; } static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) { u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); sk->sk_err = err; /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ dccp_fin(sk, skb); if (err && !sock_flag(sk, SOCK_DEAD)) sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) { struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; if (av == NULL) return; if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); dccp_ackvec_input(av, skb); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) { const struct dccp_sock *dp = dccp_sk(sk); /* Don't deliver to RX CCID when node has shut down read end. */ if (!(sk->sk_shutdown & RCV_SHUTDOWN)) ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); /* * Until the TX queue has been drained, we can not honour SHUT_WR, since * we need received feedback as input to adjust congestion control. */ if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); } static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); struct dccp_sock *dp = dccp_sk(sk); u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; /* * Step 5: Prepare sequence numbers for Sync * If P.type == Sync or P.type == SyncAck, * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, * / * P is valid, so update sequence number variables * accordingly. After this update, P will pass the tests * in Step 6. A SyncAck is generated if necessary in * Step 15 * / * Update S.GSR, S.SWL, S.SWH * Otherwise, * Drop packet and return */ if (dh->dccph_type == DCCP_PKT_SYNC || dh->dccph_type == DCCP_PKT_SYNCACK) { if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) dccp_update_gsr(sk, seqno); else return -1; } /* * Step 6: Check sequence numbers * Let LSWL = S.SWL and LAWL = S.AWL * If P.type == CloseReq or P.type == Close or P.type == Reset, * LSWL := S.GSR + 1, LAWL := S.GAR * If LSWL <= P.seqno <= S.SWH * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), * Update S.GSR, S.SWL, S.SWH * If P.type != Sync, * Update S.GAR */ lswl = dp->dccps_swl; lawl = dp->dccps_awl; if (dh->dccph_type == DCCP_PKT_CLOSEREQ || dh->dccph_type == DCCP_PKT_CLOSE || dh->dccph_type == DCCP_PKT_RESET) { lswl = ADD48(dp->dccps_gsr, 1); lawl = dp->dccps_gar; } if (between48(seqno, lswl, dp->dccps_swh) && (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || between48(ackno, lawl, dp->dccps_awh))) { dccp_update_gsr(sk, seqno); if (dh->dccph_type != DCCP_PKT_SYNC && ackno != DCCP_PKT_WITHOUT_ACK_SEQ && after48(ackno, dp->dccps_gar)) dp->dccps_gar = ackno; } else { unsigned long now = jiffies; /* * Step 6: Check sequence numbers * Otherwise, * If P.type == Reset, * Send Sync packet acknowledging S.GSR * Otherwise, * Send Sync packet acknowledging P.seqno * Drop packet and return * * These Syncs are rate-limited as per RFC 4340, 7.5.4: * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. */ if (time_before(now, (dp->dccps_rate_last + sysctl_dccp_sync_ratelimit))) return -1; DCCP_WARN("Step 6 failed for %s packet, " "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " "sending SYNC...\n", dccp_packet_name(dh->dccph_type), (unsigned long long) lswl, (unsigned long long) seqno, (unsigned long long) dp->dccps_swh, (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists", (unsigned long long) lawl, (unsigned long long) ackno, (unsigned long long) dp->dccps_awh); dp->dccps_rate_last = now; if (dh->dccph_type == DCCP_PKT_RESET) seqno = dp->dccps_gsr; dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); return -1; } return 0; } static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); switch (dccp_hdr(skb)->dccph_type) { case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: /* * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" * - sk_receive_queue is full, use Code 2, "Receive Buffer" */ dccp_enqueue_skb(sk, skb); return 0; case DCCP_PKT_ACK: goto discard; case DCCP_PKT_RESET: /* * Step 9: Process Reset * If P.type == Reset, * Tear down connection * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return */ dccp_rcv_reset(sk, skb); return 0; case DCCP_PKT_CLOSEREQ: if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; case DCCP_PKT_CLOSE: if (dccp_rcv_close(sk, skb)) return 0; goto discard; case DCCP_PKT_REQUEST: /* Step 7 * or (S.is_server and P.type == Response) * or (S.is_client and P.type == Request) * or (S.state >= OPEN and P.type == Request * and P.seqno >= S.OSR) * or (S.state >= OPEN and P.type == Response * and P.seqno >= S.OSR) * or (S.state == RESPOND and P.type == Data), * Send Sync packet acknowledging P.seqno * Drop packet and return */ if (dp->dccps_role != DCCP_ROLE_LISTEN) goto send_sync; goto check_seq; case DCCP_PKT_RESPONSE: if (dp->dccps_role != DCCP_ROLE_CLIENT) goto send_sync; check_seq: if (dccp_delta_seqno(dp->dccps_osr, DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { send_sync: dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); } break; case DCCP_PKT_SYNC: dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNCACK); /* * From RFC 4340, sec. 5.7 * * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets * MAY have non-zero-length application data areas, whose * contents receivers MUST ignore. */ goto discard; } DCCP_INC_STATS(DCCP_MIB_INERRS); discard: __kfree_skb(skb); return 0; } int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); discard: __kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(dccp_rcv_established); static int dccp_rcv_request_sent_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { /* * Step 4: Prepare sequence numbers in REQUEST * If S.state == REQUEST, * If (P.type == Response or P.type == Reset) * and S.AWL <= P.ackno <= S.AWH, * / * Set sequence number variables corresponding to the * other endpoint, so P will pass the tests in Step 6 * / * Set S.GSR, S.ISR, S.SWL, S.SWH * / * Response processing continues in Step 10; Reset * processing continues in Step 9 * / */ if (dh->dccph_type == DCCP_PKT_RESPONSE) { const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); long tstamp = dccp_timestamp(); if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, dp->dccps_awl, dp->dccps_awh)) { dccp_pr_debug("invalid ackno: S.AWL=%llu, " "P.ackno=%llu, S.AWH=%llu\n", (unsigned long long)dp->dccps_awl, (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, (unsigned long long)dp->dccps_awh); goto out_invalid_packet; } /* * If option processing (Step 8) failed, return 1 here so that * dccp_v4_do_rcv() sends a Reset. The Reset code depends on * the option type and is set in dccp_parse_options(). */ if (dccp_parse_options(sk, NULL, skb)) return 1; /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; /* * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH * is done as part of activating the feature values below, since * these settings depend on the local/remote Sequence Window * features, which were undefined or not confirmed until now. */ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) * If S.state == REQUEST, * / * If we get here, P is a valid Response from the * server (see Step 4), and we should move to * PARTOPEN state. PARTOPEN means send an Ack, * don't send Data packets, retransmit Acks * periodically, and always include any Init Cookie * from the Response * / * S.state := PARTOPEN * Set PARTOPEN timer * Continue with S.state == PARTOPEN * / * Step 12 will send the Ack completing the * three-way handshake * / */ dccp_set_state(sk, DCCP_PARTOPEN); /* * If feature negotiation was successful, activate features now; * an activation failure means that this host could not activate * one ore more features (e.g. insufficient memory), which would * leave at least one feature in an undefined state. */ if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) goto unable_to_proceed; /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); } if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || icsk->icsk_accept_queue.rskq_defer_accept) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * * It may be deleted, but with this feature tcpdumps * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ /* * OK, in DCCP we can as well do a similar trick, its * even in the draft, but there is no need for us to * schedule an ack here, as dccp_sendmsg does this for * us, also stated in the draft. -acme */ __kfree_skb(skb); return 0; } dccp_send_ack(sk); return -1; } out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; unable_to_proceed: DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; /* * We mark this socket as no longer usable, so that the loop in * dccp_sendmsg() terminates and the application gets notified. */ dccp_set_state(sk, DCCP_CLOSED); sk->sk_err = ECOMM; return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; int queued = 0; switch (dh->dccph_type) { case DCCP_PKT_RESET: inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); break; case DCCP_PKT_DATA: if (sk->sk_state == DCCP_RESPOND) break; fallthrough; case DCCP_PKT_DATAACK: case DCCP_PKT_ACK: /* * FIXME: we should be resetting the PARTOPEN (DELACK) timer * here but only if we haven't used the DELACK timer for * something else, like sending a delayed ack for a TIMESTAMP * echo, etc, for now were not clearing it, sending an extra * ACK when there is nothing else to do in DELACK is not a big * deal after all. */ /* Stop the PARTOPEN timer */ if (sk->sk_state == DCCP_PARTOPEN) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ if (likely(sample)) { long delta = dccp_timestamp() - sample; dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); } dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_set_state(sk, DCCP_OPEN); if (dh->dccph_type == DCCP_PKT_DATAACK || dh->dccph_type == DCCP_PKT_DATA) { __dccp_rcv_established(sk, skb, dh, len); queued = 1; /* packet was queued (by __dccp_rcv_established) */ } break; } return queued; } int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_hdr *dh, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const int old_state = sk->sk_state; bool acceptable; int queued = 0; /* * Step 3: Process LISTEN state * * If S.state == LISTEN, * If P.type == Request or P contains a valid Init Cookie option, * (* Must scan the packet's options to check for Init * Cookies. Only Init Cookies are processed here, * however; other options are processed in Step 8. This * scan need only be performed if the endpoint uses Init * Cookies *) * (* Generate a new socket and switch to that socket *) * Set S := new socket for this port pair * S.state = RESPOND * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init * Cookies Continue with S.state == RESPOND * (* A Response packet will be generated in Step 11 *) * Otherwise, * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { /* It is possible that we process SYN packets from backlog, * so we need to make sure to disable BH and RCU right there. */ rcu_read_lock(); local_bh_disable(); acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; local_bh_enable(); rcu_read_unlock(); if (!acceptable) return 1; consume_skb(skb); return 0; } if (dh->dccph_type == DCCP_PKT_RESET) goto discard; /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; } else if (sk->sk_state == DCCP_CLOSED) { dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; } /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) goto discard; /* * Step 7: Check for unexpected packet types * If (S.is_server and P.type == Response) * or (S.is_client and P.type == Request) * or (S.state == RESPOND and P.type == Data), * Send Sync packet acknowledging P.seqno * Drop packet and return */ if ((dp->dccps_role != DCCP_ROLE_CLIENT && dh->dccph_type == DCCP_PKT_RESPONSE) || (dp->dccps_role == DCCP_ROLE_CLIENT && dh->dccph_type == DCCP_PKT_REQUEST) || (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); goto discard; } /* Step 8: Process options */ if (dccp_parse_options(sk, NULL, skb)) return 1; /* * Step 9: Process Reset * If P.type == Reset, * Tear down connection * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { case DCCP_REQUESTING: queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; __kfree_skb(skb); return 0; case DCCP_PARTOPEN: /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); fallthrough; case DCCP_RESPOND: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; } if (dh->dccph_type == DCCP_PKT_ACK || dh->dccph_type == DCCP_PKT_DATAACK) { switch (old_state) { case DCCP_PARTOPEN: sk->sk_state_change(sk); sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); break; } } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); goto discard; } if (!queued) { discard: __kfree_skb(skb); } return 0; } EXPORT_SYMBOL_GPL(dccp_rcv_state_process); /** * dccp_sample_rtt - Validate and finalise computation of RTT sample * @sk: socket structure * @delta: number of microseconds between packet and acknowledgment * * The routine is kept generic to work in different contexts. It should be * called immediately when the ACK used for the RTT sample arrives. */ u32 dccp_sample_rtt(struct sock *sk, long delta) { /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; if (unlikely(delta <= 0)) { DCCP_WARN("unusable RTT sample %ld, using min\n", delta); return DCCP_SANE_RTT_MIN; } if (unlikely(delta > DCCP_SANE_RTT_MAX)) { DCCP_WARN("RTT sample %ld too large, using max\n", delta); return DCCP_SANE_RTT_MAX; } return delta; }
29 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_VERIFIER_H #define _LINUX_BPF_VERIFIER_H 1 #include <linux/bpf.h> /* for enum bpf_reg_type */ #include <linux/btf.h> /* for struct btf and btf_id() */ #include <linux/filter.h> /* for MAX_BPF_STACK */ #include <linux/tnum.h> /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ #define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of tmp_str_buf in bpf_verifier. * we need at least 306 bytes to fit full stack mask representation * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 /* Patch buffer size */ #define INSN_BUF_SIZE 32 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that * "one of this state's descendants read this reg" (and therefore the reg is * relevant for states_equal() checks). * Write marks collect downwards and do not propagate; they record that "the * straight-line code that reached this state (from its parent) wrote this reg" * (and therefore that reads propagated from this state or its descendants * should not propagate to its parent). * A state with a write mark can receive read marks; it just won't propagate * them to its parent, since the write mark is a property, not of the state, * but of the link between it and its parent. See mark_reg_read() and * mark_stack_slot_read() in kernel/bpf/verifier.c. */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { BPF_ITER_STATE_INVALID, /* for non-first slot */ BPF_ITER_STATE_ACTIVE, BPF_ITER_STATE_DRAINED, }; struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* * Fixed part of pointer offset, pointer types only. * Or constant delta between "linked" scalars with the same ID. */ s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct { struct bpf_map *map_ptr; /* To distinguish map lookups from outer map * the map_uid is non-zero for registers * pointing to inner maps. */ u32 map_uid; }; /* for PTR_TO_BTF_ID */ struct { struct btf *btf; u32 btf_id; }; struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ struct { enum bpf_dynptr_type type; /* A dynptr is 16 bytes so it takes up 2 stack slots. * We need to track which slot is the first slot * to protect against cases where the user may try to * pass in an address starting at the second slot of the * dynptr. */ bool first_slot; } dynptr; /* For bpf_iter stack slots */ struct { /* BTF container and BTF type ID describing * struct bpf_iter_<type> of an iterator state */ struct btf *btf; u32 btf_id; /* packing following two fields to fit iter state into 16 bytes */ enum bpf_iter_state state:2; int depth:30; } iter; /* Max size from any of the above. */ struct { unsigned long raw1; unsigned long raw2; } raw; u32 subprogno; /* for PTR_TO_FUNC */ }; /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset * from the pointed-to object, and is shared with all bpf_reg_states * with the same id as us. */ struct tnum var_off; /* Used to determine if any memory access using this register will * result in a bad access. * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ s64 smin_value; /* minimum possible (s64)value */ s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ s32 s32_min_value; /* minimum possible (s32)value */ s32 s32_max_value; /* maximum possible (s32)value */ u32 u32_min_value; /* minimum possible (u32)value */ u32 u32_max_value; /* maximum possible (u32)value */ /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * came from, when one is tested for != NULL. * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation * for the purpose of tracking that it's freed. * For PTR_TO_SOCKET this is used to share which pointers retain the * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. * Similarly to dynptrs, we use ID to track "belonging" of a reference * to a specific instance of bpf_iter. */ /* * Upper bit of ID is used to remember relationship between "linked" * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 */ #define BPF_ADD_CONST (1U << 31) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and * bpf_tcp_sock(). * * Consider the following where "sk" is a reference counted * pointer returned from "sk = bpf_sk_lookup_tcp();": * * 1: sk = bpf_sk_lookup_tcp(); * 2: if (!sk) { return 0; } * 3: fullsock = bpf_sk_fullsock(sk); * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } * 5: tp = bpf_tcp_sock(fullsock); * 6: if (!tp) { bpf_sk_release(sk); return 0; } * 7: bpf_sk_release(sk); * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain * * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and * "tp" ptr should be invalidated also. In order to do that, * the reg holding "fullsock" and "sk" need to remember * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id * such that the verifier can reset all regs which have * ref_obj_id matching the sk_reg->id. * * sk_reg->ref_obj_id is set to sk_reg->id at line 1. * sk_reg->id will stay as NULL-marking purpose only. * After NULL-marking is done, sk_reg->id can be reset to 0. * * After "fullsock = bpf_sk_fullsock(sk);" at line 3, * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. * * After "tp = bpf_tcp_sock(fullsock);" at line 5, * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id * which is the same as sk_reg->ref_obj_id. * * From the verifier perspective, if sk, fullsock and tp * are not NULL, they are the same ptr with different * reg->type. In particular, bpf_sk_release(tp) is also * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; /* parentage chain for liveness checking */ struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' * is used which is an index in bpf_verifier_state->frame[] array * pointing to bpf_func_state. */ u32 frameno; /* Tracks subreg definition. The stored value is the insn_idx of the * writing insn. This is safe because subreg_def is used before any insn * patching which only happens after main verification finished. */ s32 subreg_def; enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ STACK_MISC, /* BPF program wrote some data into this slot */ STACK_ZERO, /* BPF program wrote constant zero */ /* A dynptr is stored in this stack slot. The type of dynptr * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, STACK_ITER, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ #define BPF_REGMASK_ARGS ((1 << BPF_REG_1) | (1 << BPF_REG_2) | \ (1 << BPF_REG_3) | (1 << BPF_REG_4) | \ (1 << BPF_REG_5)) #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) struct bpf_stack_state { struct bpf_reg_state spilled_ptr; u8 slot_type[BPF_REG_SIZE]; }; struct bpf_reference_state { /* Each reference object has a type. Ensure REF_TYPE_PTR is zero to * default to pointer reference on zero initialization of a state. */ enum ref_state_type { REF_TYPE_PTR = 0, REF_TYPE_LOCK, } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). */ int id; /* Instruction where the allocation of this reference occurred. This * is used purely to inform the user of a reference leak. */ int insn_idx; /* Use to keep track of the source object of a lock, to ensure * it matches on unlock. */ void *ptr; }; struct bpf_retval_range { s32 minval; s32 maxval; }; /* state of the program: * type of all registers and stack info */ struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; /* index of call instruction that called into this func */ int callsite; /* stack frame number of this function state from pov of * enclosing bpf_verifier_state. * 0 = main function, 1 = first callee. */ u32 frameno; /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; /* Every bpf_timer_start will increment async_entry_cnt. * It's used to distinguish: * void foo(void) { for(;;); } * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible * callback executions (e.g. bpf_loop) keeps track of current * simulated iteration number. * Value in frame N refers to number of times callback with frame * N+1 was simulated, e.g. for the following call: * * bpf_loop(..., fn, ...); | suppose current frame is N * | fn would be simulated in frame N+1 * | number of simulations is tracked in frame N */ u32 callback_depth; /* The following fields should be last. See copy_func_state() */ int acquired_refs; int active_locks; struct bpf_reference_state *refs; /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory. * stack[0] represents bytes [*(r10-8)..*(r10-1)] * stack[1] represents bytes [*(r10-16)..*(r10-9)] * ... * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] */ struct bpf_stack_state *stack; /* Size of the current stack, in bytes. The stack state is tracked below, in * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. */ int allocated_stack; }; #define MAX_CALL_FRAMES 8 /* instruction history flags, used in bpf_insn_hist_entry.flags field */ enum { /* instruction references stack slot through PTR_TO_STACK register; * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, * 8 bytes per slot, so slot index (spi) is [0, 63]) */ INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ INSN_F_SPI_MASK = 0x3f, /* 6 bits */ INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ }; static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); struct bpf_insn_hist_entry { u32 idx; /* insn idx can't be bigger than 1 million */ u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; /* additional registers that need precision tracking when this * jump is backtracked, vector of six 10-bit records */ u64 linked_regs; }; /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; struct bpf_verifier_state *parent; /* * 'branches' field is the number of branches left to explore: * 0 - all possible paths from this state reached bpf_exit or * were safely pruned * 1 - at least one path is being explored. * This state hasn't reached bpf_exit * 2 - at least two paths are being explored. * This state is an immediate parent of two children. * One is fallthrough branch with branches==1 and another * state is pushed into stack (to be explored later) also with * branches==1. The parent of this state has branches==1. * The verifier state tree connected via 'parent' pointer looks like: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 2 -> 1 (second 'if' pushed into stack) * 1 * 1 * 1 bpf_exit. * * Once do_check() reaches bpf_exit, it calls update_branch_counts() * and the verifier state tree will look: * 1 * 1 * 2 -> 1 (first 'if' pushed into stack) * 1 * 1 -> 1 (second 'if' pushed into stack) * 0 * 0 * 0 bpf_exit. * After pop_stack() the do_check() will resume at second 'if'. * * If is_state_visited() sees a state with branches > 0 it means * there is a loop. If such state is exactly equal to the current state * it's an infinite loop. Note states_equal() checks for states * equivalency, so two states being 'states_equal' does not mean * infinite loop. The exact comparison is provided by * states_maybe_looping() function. It's a stronger pre-check and * much faster than states_equal(). * * This algorithm may not find all possible infinite loops or * loop iteration count may be too high. * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. */ u32 branches; u32 insn_idx; u32 curframe; bool speculative; bool active_rcu_lock; u32 active_preempt_lock; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. */ bool used_as_loop_entry; bool in_sleepable; /* first and last insn idx of this verifier state */ u32 first_insn_idx; u32 last_insn_idx; /* If this state is a part of states loop this field points to some * parent of this state such that: * - it is also a member of the same states loop; * - DFS states traversal starting from initial state visits loop_entry * state before this state. * Used to compute topmost loop entry for state loops. * State loops might appear because of open coded iterators logic. * See get_loop_entry() for more information. */ struct bpf_verifier_state *loop_entry; /* Sub-range of env->insn_hist[] corresponding to this state's * instruction history. * Backtracking is using it to go from last to first. * For most states instruction history is short, 0-3 instructions. * For loops can go up to ~40. */ u32 insn_hist_start; u32 insn_hist_end; u32 dfs_depth; u32 callback_unroll_depth; u32 may_goto_depth; }; #define bpf_get_spilled_reg(slot, frame, mask) \ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ ((1 << frame->stack[slot].slot_type[BPF_REG_SIZE - 1]) & (mask))) \ ? &frame->stack[slot].spilled_ptr : NULL) /* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ #define bpf_for_each_spilled_reg(iter, frame, reg, mask) \ for (iter = 0, reg = bpf_get_spilled_reg(iter, frame, mask); \ iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame, mask)) #define bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, __mask, __expr) \ ({ \ struct bpf_verifier_state *___vstate = __vst; \ int ___i, ___j; \ for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ struct bpf_reg_state *___regs; \ __state = ___vstate->frame[___i]; \ ___regs = __state->regs; \ for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ __reg = &___regs[___j]; \ (void)(__expr); \ } \ bpf_for_each_spilled_reg(___j, __state, __reg, __mask) { \ if (!__reg) \ continue; \ (void)(__expr); \ } \ } \ }) /* Invoke __expr over regsiters in __vst, setting __state and __reg */ #define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ bpf_for_each_reg_in_vstate_mask(__vst, __state, __reg, 1 << STACK_SPILL, __expr) /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; struct bpf_verifier_state_list *next; int miss_cnt, hit_cnt; }; struct bpf_loop_inline_state { unsigned int initialized:1; /* set to true upon first entry */ unsigned int fit_for_inline:1; /* true if callback function is the same * at each call and flags are always zero */ u32 callback_subprogno; /* valid when fit_for_inline is true */ }; /* pointer and state for maps */ struct bpf_map_ptr_state { struct bpf_map *map_ptr; bool poison; bool unpriv; }; /* Possible states for alu_state member. */ #define BPF_ALU_SANITIZE_SRC (1U << 0) #define BPF_ALU_SANITIZE_DST (1U << 1) #define BPF_ALU_NEG_VALUE (1U << 2) #define BPF_ALU_NON_POINTER (1U << 3) #define BPF_ALU_IMMEDIATE (1U << 4) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map_ptr_state map_ptr_state; s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ struct { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { struct { struct btf *btf; u32 btf_id; /* btf_id for struct typed var */ }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; /* if instruction is a call to bpf_loop this field tracks * the state of the relevant registers to make decision about inlining */ struct bpf_loop_inline_state loop_inline_state; }; union { /* remember the size of type passed to bpf_obj_new to rewrite R1 */ u64 obj_new_size; /* remember the offset of node field within type to rewrite */ u64 insert_off; }; struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* true if STX or LDX instruction is a part of a spill/fill * pattern for a bpf_fastcall call. */ u8 fastcall_pattern:1; /* for CALL instructions, a number of spill/fill pairs in the * bpf_fastcall pattern. */ u8 fastcall_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool jmp_point; bool prune_point; /* ensure we check state equivalence and save state checkpoint and * this instruction, regardless of any heuristics */ bool force_checkpoint; /* true if instruction is a call to a helper function that * accepts callback function as a parameter. */ bool calls_callback; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ #define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { /* Logical start and end positions of a "log window" of the verifier log. * start_pos == 0 means we haven't truncated anything. * Once truncation starts to happen, start_pos + len_total == end_pos, * except during log reset situations, in which (end_pos - start_pos) * might get smaller than len_total (see bpf_vlog_reset()). * Generally, (end_pos - start_pos) gives number of useful data in * user log buffer. */ u64 start_pos; u64 end_pos; char __user *ubuf; u32 level; u32 len_total; u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 #define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; } #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_arg_info { enum bpf_arg_type arg_type; union { u32 mem_size; u32 btf_id; }; }; enum priv_stack_mode { PRIV_STACK_UNKNOWN, NO_PRIV_STACK, PRIV_STACK_ADAPTIVE, }; struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) * are used for bpf_fastcall spills and fills. */ s16 fastcall_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; enum priv_stack_mode priv_stack_mode; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; struct bpf_verifier_env; struct backtrack_state { struct bpf_verifier_env *env; u32 frame; u32 reg_masks[MAX_CALL_FRAMES]; u64 stack_masks[MAX_CALL_FRAMES]; }; struct bpf_id_pair { u32 old; u32 cur; }; struct bpf_idmap { u32 tmp_id_gen; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; struct bpf_idset { u32 count; u32 ids[BPF_ID_MAP_SIZE]; }; /* single container for all structs * one verifier_env per bpf_check() call */ struct bpf_verifier_env { u32 insn_idx; u32 prev_insn_idx; struct bpf_prog *prog; /* eBPF program being verified */ const struct bpf_verifier_ops *ops; struct module *attach_btf_mod; /* The owner module of prog->aux->attach_btf */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; /* Allow access to uninitialized stack memory. Writes with fixed offset are * always allowed, so this refers to reads (with fixed or variable offset), * to writes with variable offset and to indirect (helper) accesses. */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; bool seen_exception; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 2]; /* max + 2 for the fake and exception subprogs */ union { struct bpf_idmap idmap_scratch; struct bpf_idset idset_scratch; }; struct { int *insn_state; int *insn_stack; int cur_stack; } cfg; struct backtrack_state bt; struct bpf_insn_hist_entry *insn_hist; struct bpf_insn_hist_entry *cur_hist_ent; u32 insn_hist_cap; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 prev_insn_processed, insn_processed; /* number of jmps, calls, exits analyzed so far */ u32 prev_jmps_processed, jmps_processed; /* total verification time */ u64 verification_time; /* maximum number of verifier states kept in 'branching' instructions */ u32 max_states_per_insn; /* total number of allocated verifier states */ u32 total_states; /* some states are freed during program analysis. * this is peak number of states. this number dominates kernel * memory consumption during verification */ u32 peak_states; /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; bpfptr_t fd_array; /* bit mask to keep track of whether a register has been accessed * since the last time the function state was printed */ u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; /* buffer used to temporary hold constants as scalar registers */ struct bpf_reg_state fake_reg[2]; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) { return &env->prog->aux->func_info_aux[subprog]; } static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) { return &env->subprog_info[subprog]; } __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; return cur->frame[cur->curframe]; } static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { return cur_func(env)->regs; } int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx); int bpf_prog_offload_finalize(struct bpf_verifier_env *env); void bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, struct bpf_insn *insn); void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) { if (tgt_prog) return ((u64)tgt_prog->aux->id << 32) | btf_id; else return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } /* unpack the IDs from the key as constructed above */ static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) { if (obj_id) *obj_id = key >> 32; if (btf_id) *btf_id = key & 0x7FFFFFFF; } int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, u32 btf_id, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); int mark_chain_precision(struct bpf_verifier_env *env, int regno); #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ static inline u32 base_type(u32 type) { return type & BPF_BASE_TYPE_MASK; } /* extract flags from an extended type. See bpf_type_flag in bpf.h. */ static inline u32 type_flag(u32 type) { return type & ~BPF_BASE_TYPE_MASK; } /* only use after check_attach_btf_id() */ static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return (prog->type == BPF_PROG_TYPE_EXT && prog->aux->saved_dst_prog_type) ? prog->aux->saved_dst_prog_type : prog->type; } static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) { switch (resolve_prog_type(prog)) { case BPF_PROG_TYPE_TRACING: return prog->expected_attach_type != BPF_TRACE_ITER; case BPF_PROG_TYPE_STRUCT_OPS: return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: return false; default: return true; } } #define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED | NON_OWN_REF) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } static inline bool type_is_ptr_alloc_obj(u32 type) { return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } static inline bool type_is_non_owning_ref(u32 type) { return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; } static inline bool type_is_pkt_pointer(enum bpf_reg_type type) { type = base_type(type); return type == PTR_TO_PACKET || type == PTR_TO_PACKET_META; } static inline bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || type == PTR_TO_SOCK_COMMON || type == PTR_TO_TCP_SOCK || type == PTR_TO_XDP_SOCK; } static inline bool type_may_be_null(u32 type) { return type & PTR_MAYBE_NULL; } static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; } static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) { env->scratched_stack_slots |= 1ULL << spi; } static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) { return (env->scratched_regs >> regno) & 1; } static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) { return (env->scratched_stack_slots >> regno) & 1; } static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) { return env->scratched_regs || env->scratched_stack_slots; } static inline void mark_verifier_state_clean(struct bpf_verifier_env *env) { env->scratched_regs = 0U; env->scratched_stack_slots = 0ULL; } /* Used for printing the entire verifier state. */ static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) { env->scratched_regs = ~0U; env->scratched_stack_slots = ~0ULL; } static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_size) { #ifdef __BIG_ENDIAN off -= spill_size - fill_size; #endif return !(off % BPF_REG_SIZE); } const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); const char *iter_state_str(enum bpf_iter_state state); void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, bool print_all); void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); #endif /* _LINUX_BPF_VERIFIER_H */
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/net/dsa_stubs.h - Stubs for the Distributed Switch Architecture framework */ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <net/dsa.h> #if IS_ENABLED(CONFIG_NET_DSA) extern const struct dsa_stubs *dsa_stubs; struct dsa_stubs { int (*conduit_hwtstamp_validate)(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); }; static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { if (!netdev_uses_dsa(dev)) return 0; /* rtnl_lock() is a sufficient guarantee, because as long as * netdev_uses_dsa() returns true, the dsa_core module is still * registered, and so, dsa_unregister_stubs() couldn't have run. * For netdev_uses_dsa() to start returning false, it would imply that * dsa_conduit_teardown() has executed, which requires rtnl_lock(). */ ASSERT_RTNL(); return dsa_stubs->conduit_hwtstamp_validate(dev, config, extack); } #else static inline int dsa_conduit_hwtstamp_validate(struct net_device *dev, const struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack) { return 0; } #endif
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 /* * Copyright (c) 2004 The Regents of the University of Michigan. * Copyright (c) 2012 Jeff Layton <jlayton@redhat.com> * All rights reserved. * * Andy Adamson <andros@citi.umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include <crypto/hash.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/module.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/clnt.h> #include <linux/nfsd/cld.h> #include "nfsd.h" #include "state.h" #include "vfs.h" #include "netns.h" #define NFSDDBG_FACILITY NFSDDBG_PROC /* Declarations */ struct nfsd4_client_tracking_ops { int (*init)(struct net *); void (*exit)(struct net *); void (*create)(struct nfs4_client *); void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); uint8_t version; size_t msglen; }; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static int nfs4_save_creds(const struct cred **original_creds) { struct cred *new; new = prepare_creds(); if (!new) return -ENOMEM; new->fsuid = GLOBAL_ROOT_UID; new->fsgid = GLOBAL_ROOT_GID; *original_creds = override_creds(new); put_cred(new); return 0; } static void nfs4_reset_creds(const struct cred *original) { revert_creds(original); } static void md5_to_hex(char *out, char *md5) { int i; for (i=0; i<16; i++) { unsigned char c = md5[i]; *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); } *out = '\0'; } static int nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) { struct xdr_netobj cksum; struct crypto_shash *tfm; int status; dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", clname->len, clname->data); tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(tfm)) { status = PTR_ERR(tfm); goto out_no_tfm; } cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { status = -ENOMEM; goto out; } status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, cksum.data); if (status) goto out; md5_to_hex(dname, cksum.data); status = 0; out: kfree(cksum.data); crypto_free_shash(tfm); out_no_tfm: return status; } /* * If we had an error generating the recdir name for the legacy tracker * then warn the admin. If the error doesn't appear to be transient, * then disable recovery tracking. */ static void legacy_recdir_name_error(struct nfs4_client *clp, int error) { printk(KERN_ERR "NFSD: unable to generate recoverydir " "name (%d).\n", error); /* * if the algorithm just doesn't exist, then disable the recovery * tracker altogether. The crypto libs will generally return this if * FIPS is enabled as well. */ if (error == -ENOENT) { printk(KERN_ERR "NFSD: disabling legacy clientid tracking. " "Reboot recovery will not function correctly!\n"); nfsd4_client_tracking_exit(clp->net); } } static void __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); return; } name.len = len; crp = nfs4_client_to_reclaim(name, princhash, nn); if (!crp) { kfree(name.data); return; } crp->cr_clp = clp; } static void nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char dname[HEXDIR_LEN]; struct dentry *dir, *dentry; int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; if (!nn->rec_file) return; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return legacy_recdir_name_error(clp, status); status = nfs4_save_creds(&original_cred); if (status < 0) return; status = mnt_want_write_file(nn->rec_file); if (status) goto out_creds; dir = nn->rec_file->f_path.dentry; /* lock the parent */ inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } if (d_really_is_positive(dentry)) /* * In the 4.1 case, where we're called from * reclaim_complete(), records from the previous reboot * may still be left, so this is OK. * * In the 4.0 case, we should never get here; but we may * as well be forgiving and just succeed silently. */ goto out_put; status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) __nfsd4_create_reclaim_record_grace(clp, dname, HEXDIR_LEN, nn); vfs_fsync(nn->rec_file, 0); } else { printk(KERN_ERR "NFSD: failed to write recovery record" " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); } mnt_drop_write_file(nn->rec_file); out_creds: nfs4_reset_creds(original_cred); } typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *); struct name_list { char name[HEXDIR_LEN]; struct list_head list; }; struct nfs4_dir_ctx { struct dir_context ctx; struct list_head names; }; static bool nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct nfs4_dir_ctx *ctx = container_of(__ctx, struct nfs4_dir_ctx, ctx); struct name_list *entry; if (namlen != HEXDIR_LEN - 1) return true; entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); if (entry == NULL) return false; memcpy(entry->name, name, HEXDIR_LEN - 1); entry->name[HEXDIR_LEN - 1] = '\0'; list_add(&entry->list, &ctx->names); return true; } static int nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) { const struct cred *original_cred; struct dentry *dir = nn->rec_file->f_path.dentry; struct nfs4_dir_ctx ctx = { .ctx.actor = nfsd4_build_namelist, .names = LIST_HEAD_INIT(ctx.names) }; struct name_list *entry, *tmp; int status; status = nfs4_save_creds(&original_cred); if (status < 0) return status; status = vfs_llseek(nn->rec_file, 0, SEEK_SET); if (status < 0) { nfs4_reset_creds(original_cred); return status; } status = iterate_dir(nn->rec_file, &ctx.ctx); inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { struct dentry *dentry; dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); break; } status = f(dir, dentry, nn); dput(dentry); } list_del(&entry->list); kfree(entry); } inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name); list_del(&entry->list); kfree(entry); } return status; } static int nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) { struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } status = -ENOENT; if (d_really_is_negative(dentry)) goto out; status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry); out: dput(dentry); out_unlock: inode_unlock(d_inode(dir)); return status; } static void __nfsd4_remove_reclaim_record_grace(const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); return; } name.len = len; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) nfs4_remove_reclaim_record(crp, nn); } static void nfsd4_remove_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char dname[HEXDIR_LEN]; int status; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return legacy_recdir_name_error(clp, status); status = mnt_want_write_file(nn->rec_file); if (status) goto out; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); status = nfs4_save_creds(&original_cred); if (status < 0) goto out_drop_write; status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); nfs4_reset_creds(original_cred); if (status == 0) { vfs_fsync(nn->rec_file, 0); if (nn->in_grace) __nfsd4_remove_reclaim_record_grace(dname, HEXDIR_LEN, nn); } out_drop_write: mnt_drop_write_file(nn->rec_file); out: if (status) printk("NFSD: Failed to remove expired client state directory" " %.*s\n", HEXDIR_LEN, dname); } static int purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { int status; struct xdr_netobj name; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", __func__, child); /* Keep trying; maybe the others are OK: */ return 0; } name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); goto out; } name.len = HEXDIR_LEN; if (nfs4_has_reclaimed_state(name, nn)) goto out_free; status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); out_free: kfree(name.data); out: /* Keep trying, success or failure: */ return 0; } static void nfsd4_recdir_purge_old(struct nfsd_net *nn) { int status; nn->in_grace = false; if (!nn->rec_file) return; status = mnt_want_write_file(nn->rec_file); if (status) goto out; status = nfsd4_list_rec_dir(purge_old, nn); if (status == 0) vfs_fsync(nn->rec_file, 0); mnt_drop_write_file(nn->rec_file); out: nfs4_release_reclaim(nn); if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %pD\n", nn->rec_file); } static int load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { struct xdr_netobj name; struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", __func__, child); /* Keep trying; maybe the others are OK: */ return 0; } name.data = kmemdup_nul(child->d_name.name, child->d_name.len, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); goto out; } name.len = HEXDIR_LEN; if (!nfs4_client_to_reclaim(name, princhash, nn)) kfree(name.data); out: return 0; } static int nfsd4_recdir_load(struct net *net) { int status; struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->rec_file) return 0; status = nfsd4_list_rec_dir(load_recdir, nn); if (status) printk("nfsd4: failed loading clients from recovery" " directory %pD\n", nn->rec_file); return status; } /* * Hold reference to the recovery directory. */ static int nfsd4_init_recdir(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct cred *original_cred; int status; printk("NFSD: Using %s as the NFSv4 state recovery directory\n", user_recovery_dirname); BUG_ON(nn->rec_file); status = nfs4_save_creds(&original_cred); if (status < 0) { printk("NFSD: Unable to change credentials to find recovery" " directory: error %d\n", status); return status; } nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); if (IS_ERR(nn->rec_file)) { printk("NFSD: unable to find recovery directory %s\n", user_recovery_dirname); status = PTR_ERR(nn->rec_file); nn->rec_file = NULL; } nfs4_reset_creds(original_cred); if (!status) nn->in_grace = true; return status; } static void nfsd4_shutdown_recdir(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (!nn->rec_file) return; fput(nn->rec_file); nn->rec_file = NULL; } static int nfs4_legacy_state_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->reclaim_str_hashtbl) return -ENOMEM; for (i = 0; i < CLIENT_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); nn->reclaim_str_hashtbl_size = 0; return 0; } static void nfs4_legacy_state_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); kfree(nn->reclaim_str_hashtbl); } static int nfsd4_load_reboot_recovery_data(struct net *net) { int status; status = nfsd4_init_recdir(net); if (status) return status; status = nfsd4_recdir_load(net); if (status) nfsd4_shutdown_recdir(net); return status; } static int nfsd4_legacy_tracking_init(struct net *net) { int status; /* XXX: The legacy code won't work in a container */ if (net != &init_net) { pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n"); return -EINVAL; } status = nfs4_legacy_state_init(net); if (status) return status; status = nfsd4_load_reboot_recovery_data(net); if (status) goto err; pr_info("NFSD: Using legacy client tracking operations.\n"); return 0; err: nfs4_legacy_state_shutdown(net); return status; } static void nfsd4_legacy_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); nfsd4_shutdown_recdir(net); nfs4_legacy_state_shutdown(net); } /* * Change the NFSv4 recovery directory to recdir. */ int nfs4_reset_recoverydir(char *recdir) { int status; struct path path; status = kern_path(recdir, LOOKUP_FOLLOW, &path); if (status) return status; status = -ENOTDIR; if (d_is_dir(path.dentry)) { strcpy(user_recovery_dirname, recdir); status = 0; } path_put(&path); return status; } char * nfs4_recoverydir(void) { return user_recovery_dirname; } static int nfsd4_check_legacy_client(struct nfs4_client *clp) { int status; char dname[HEXDIR_LEN]; struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct xdr_netobj name; /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) { legacy_recdir_name_error(clp, status); return status; } /* look for it in the reclaim hashtable otherwise */ name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); goto out_enoent; } name.len = HEXDIR_LEN; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) { set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); crp->cr_clp = clp; return 0; } out_enoent: return -ENOENT; } static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .init = nfsd4_legacy_tracking_init, .exit = nfsd4_legacy_tracking_exit, .create = nfsd4_create_clid_dir, .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, .version = 1, .msglen = 0, }; #endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */ /* Globals */ #define NFSD_PIPE_DIR "nfsd" #define NFSD_CLD_PIPE "cld" /* per-net-ns structure for holding cld upcall info */ struct cld_net { struct rpc_pipe *cn_pipe; spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; struct crypto_shash *cn_tfm; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING bool cn_has_legacy; #endif }; struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; union { struct cld_msg_hdr cu_hdr; struct cld_msg cu_msg; struct cld_msg_v2 cu_msg_v2; } cu_u; }; static int __cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; struct rpc_pipe_msg msg; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { goto out; } wait_for_completion(&cup->cu_done); if (msg.errno < 0) ret = msg.errno; out: return ret; } static int cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; /* * -EAGAIN occurs when pipe is closed and reopened while there are * upcalls queued. */ do { ret = __cld_pipe_upcall(pipe, cmsg, nn); } while (ret == -EAGAIN); return ret; } static ssize_t __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct nfsd_net *nn) { uint8_t cmd, princhashlen; struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; if (get_user(cmd, &cmsg->cm_cmd)) { dprintk("%s: error when copying cmd from userspace", __func__); return -EFAULT; } if (cmd == Cld_GraceStart) { if (nn->client_tracking_ops->version >= 2) { const struct cld_clntinfo __user *ci; ci = &cmsg->cm_u.cm_clntinfo; if (get_user(namelen, &ci->cc_name.cn_len)) return -EFAULT; if (namelen == 0 || namelen > NFS4_OPAQUE_LIMIT) { dprintk("%s: invalid namelen (%u)", __func__, namelen); return -EINVAL; } name.data = memdup_user(&ci->cc_name.cn_id, namelen); if (IS_ERR(name.data)) return PTR_ERR(name.data); name.len = namelen; get_user(princhashlen, &ci->cc_princhash.cp_len); if (princhashlen > 0) { princhash.data = memdup_user( &ci->cc_princhash.cp_data, princhashlen); if (IS_ERR(princhash.data)) { kfree(name.data); return PTR_ERR(princhash.data); } princhash.len = princhashlen; } else princhash.len = 0; } else { const struct cld_name __user *cnm; cnm = &cmsg->cm_u.cm_name; if (get_user(namelen, &cnm->cn_len)) return -EFAULT; if (namelen == 0 || namelen > NFS4_OPAQUE_LIMIT) { dprintk("%s: invalid namelen (%u)", __func__, namelen); return -EINVAL; } name.data = memdup_user(&cnm->cn_id, namelen); if (IS_ERR(name.data)) return PTR_ERR(name.data); name.len = namelen; } #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { struct cld_net *cn = nn->cld_net; name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } #endif if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); kfree(princhash.data); return -EFAULT; } return nn->client_tracking_ops->msglen; } return -EFAULT; } static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); struct cld_net *cn = nn->cld_net; int16_t status; if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } /* * copy the status so we know whether to remove the upcall from the * list (for -EINPROGRESS, we just want to make sure the xid is * valid, not remove the upcall from the list) */ if (get_user(status, &hdr->cm_status)) { dprintk("%s: error when copying status from userspace", __func__); return -EFAULT; } /* walk the list and find corresponding xid */ cup = NULL; spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; if (status != -EINPROGRESS) list_del_init(&cup->cu_list); break; } } spin_unlock(&cn->cn_lock); /* couldn't find upcall? */ if (!cup) { dprintk("%s: couldn't find upcall -- xid=%u\n", __func__, xid); return -EINVAL; } if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); return mlen; } static void cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) return; complete(&cup->cu_done); } static const struct rpc_pipe_ops cld_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = cld_pipe_downcall, .destroy_msg = cld_pipe_destroy_msg, }; static struct dentry * nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { struct dentry *dir, *dentry; dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR); if (dir == NULL) return ERR_PTR(-ENOENT); dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe); dput(dir); return dentry; } static void nfsd4_cld_unregister_sb(struct rpc_pipe *pipe) { if (pipe->dentry) rpc_unlink(pipe->dentry); } static struct dentry * nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *sb; struct dentry *dentry; sb = rpc_get_sb_net(net); if (!sb) return NULL; dentry = nfsd4_cld_register_sb(sb, pipe); rpc_put_sb_net(net); return dentry; } static void nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *sb; sb = rpc_get_sb_net(net); if (sb) { nfsd4_cld_unregister_sb(pipe); rpc_put_sb_net(net); } } /* Initialize rpc_pipefs pipe for communication with client tracking daemon */ static int __nfsd4_init_cld_pipe(struct net *net) { int ret; struct dentry *dentry; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn; if (nn->cld_net) return 0; cn = kzalloc(sizeof(*cn), GFP_KERNEL); if (!cn) { ret = -ENOMEM; goto err; } cn->cn_pipe = rpc_mkpipe_data(&cld_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(cn->cn_pipe)) { ret = PTR_ERR(cn->cn_pipe); goto err; } spin_lock_init(&cn->cn_lock); INIT_LIST_HEAD(&cn->cn_list); dentry = nfsd4_cld_register_net(net, cn->cn_pipe); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto err_destroy_data; } cn->cn_pipe->dentry = dentry; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING cn->cn_has_legacy = false; #endif nn->cld_net = cn; return 0; err_destroy_data: rpc_destroy_pipe_data(cn->cn_pipe); err: kfree(cn); printk(KERN_ERR "NFSD: unable to create nfsdcld upcall pipe (%d)\n", ret); return ret; } static int nfsd4_init_cld_pipe(struct net *net) { int status; status = __nfsd4_init_cld_pipe(net); if (!status) pr_info("NFSD: Using old nfsdcld client tracking operations.\n"); return status; } static void nfsd4_remove_cld_pipe(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); if (cn->cn_tfm) crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } static struct cld_upcall * alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; struct cld_net *cn = nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return new; /* FIXME: hard cap on number in flight? */ restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } static void free_cld_upcall(struct cld_upcall *victim) { struct cld_net *cn = victim->cu_net; spin_lock(&cn->cn_lock); list_del(&victim->cu_list); spin_unlock(&cn->cn_lock); kfree(victim); } /* Ask daemon to create a new record */ static void nfsd4_cld_create(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_Create; cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } free_cld_upcall(cup); out_err: if (ret) printk(KERN_ERR "NFSD: Unable to create client " "record on stable storage: %d\n", ret); } /* Ask daemon to create a new record */ static void nfsd4_cld_create_v2(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct cld_msg_v2 *cmsg; struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cmsg = &cup->cu_u.cu_msg_v2; cmsg->cm_cmd = Cld_Create; cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, clp->cl_name.len); if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { ret = -ENOMEM; goto out; } ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), cksum.data); if (ret) { kfree(cksum.data); goto out; } cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, cksum.data, cksum.len); kfree(cksum.data); } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn); if (!ret) { ret = cmsg->cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } out: free_cld_upcall(cup); out_err: if (ret) pr_err("NFSD: Unable to create client record on stable storage: %d\n", ret); } /* Ask daemon to create a new record */ static void nfsd4_cld_remove(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if it's already removed */ if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_Remove; cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } free_cld_upcall(cup); out_err: if (ret) printk(KERN_ERR "NFSD: Unable to remove client " "record from stable storage: %d\n", ret); } /* * For older nfsdcld's that do not allow us to "slurp" the clients * from the tracking database during startup. * * Check for presence of a record, and update its timestamp */ static int nfsd4_cld_check_v0(struct nfs4_client *clp) { int ret; struct cld_upcall *cup; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; /* Don't upcall if one was already stored during this grace pd */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } cup->cu_u.cu_msg.cm_cmd = Cld_Check; cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } free_cld_upcall(cup); return ret; } /* * For newer nfsdcld's that allow us to "slurp" the clients * from the tracking database during startup. * * Check for presence of a record in the reclaim_str_hashtbl */ static int nfsd4_cld_check(struct nfs4_client *clp) { struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; /* look for it in the reclaim hashtable otherwise */ crp = nfsd4_find_reclaim_client(clp->cl_name, nn); if (crp) goto found; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (nn->cld_net->cn_has_legacy) { int status; char dname[HEXDIR_LEN]; struct xdr_netobj name; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data!\n", __func__); return -ENOENT; } name.len = HEXDIR_LEN; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) goto found; } #endif return -ENOENT; found: crp->cr_clp = clp; return 0; } static int nfsd4_cld_check_v2(struct nfs4_client *clp) { struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; int status; struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; /* look for it in the reclaim hashtable otherwise */ crp = nfsd4_find_reclaim_client(clp->cl_name, nn); if (crp) goto found; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING if (cn->cn_has_legacy) { struct xdr_netobj name; char dname[HEXDIR_LEN]; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) return -ENOENT; name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); if (!name.data) { dprintk("%s: failed to allocate memory for name.data\n", __func__); return -ENOENT; } name.len = HEXDIR_LEN; crp = nfsd4_find_reclaim_client(name, nn); kfree(name.data); if (crp) goto found; } #endif return -ENOENT; found: if (crp->cr_princhash.len) { if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) return -ENOENT; status = crypto_shash_tfm_digest(tfm, principal, strlen(principal), cksum.data); if (status) { kfree(cksum.data); return -ENOENT; } if (memcmp(crp->cr_princhash.data, cksum.data, crp->cr_princhash.len)) { kfree(cksum.data); return -ENOENT; } kfree(cksum.data); } crp->cr_clp = clp; return 0; } static int nfsd4_cld_grace_start(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: if (ret) dprintk("%s: Unable to get clients from userspace: %d\n", __func__, ret); return ret; } /* For older nfsdcld's that need cm_gracetime */ static void nfsd4_cld_grace_done_v0(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: if (ret) printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } /* * For newer nfsdcld's that do not need cm_gracetime. We also need to call * nfs4_release_reclaim() to clear out the reclaim_str_hashtbl. */ static void nfsd4_cld_grace_done(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: nfs4_release_reclaim(nn); if (ret) printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } static int nfs4_cld_state_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; nn->reclaim_str_hashtbl = kmalloc_array(CLIENT_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); if (!nn->reclaim_str_hashtbl) return -ENOMEM; for (i = 0; i < CLIENT_HASH_SIZE; i++) INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]); nn->reclaim_str_hashtbl_size = 0; nn->track_reclaim_completes = true; atomic_set(&nn->nr_reclaim_complete, 0); return 0; } static void nfs4_cld_state_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nn->track_reclaim_completes = false; kfree(nn->reclaim_str_hashtbl); } static bool cld_running(struct nfsd_net *nn) { struct cld_net *cn = nn->cld_net; struct rpc_pipe *pipe = cn->cn_pipe; return pipe->nreaders || pipe->nwriters; } static int nfsd4_cld_get_version(struct nfsd_net *nn) { int ret = 0; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; uint8_t version; cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; if (ret) goto out_free; version = cup->cu_u.cu_msg.cm_u.cm_version; dprintk("%s: userspace returned version %u\n", __func__, version); if (version < 1) version = 1; else if (version > CLD_UPCALL_VERSION) version = CLD_UPCALL_VERSION; switch (version) { case 1: nn->client_tracking_ops = &nfsd4_cld_tracking_ops; break; case 2: nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; break; default: break; } } out_free: free_cld_upcall(cup); out_err: if (ret) dprintk("%s: Unable to get version from userspace: %d\n", __func__, ret); return ret; } static int nfsd4_cld_tracking_init(struct net *net) { int status; struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) return status; status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to * queue an upcall unless we know that nfsdcld is running (because we * want this to fail fast so that nfsd4_client_tracking_init() can try * the next client tracking method). nfsdcld should already be running * before nfsd is started, so the wait here is for nfsdcld to open the * pipefs file we just created. */ while (!(running = cld_running(nn)) && retries--) msleep(100); if (!running) { status = -ETIMEDOUT; goto err_remove; } tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { status = PTR_ERR(tfm); goto err_remove; } nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); status = nfsd4_cld_grace_start(nn); if (status) { if (status == -EOPNOTSUPP) pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); nfs4_release_reclaim(nn); goto err_remove; } else pr_info("NFSD: Using nfsdcld client tracking operations.\n"); return 0; err_remove: nfsd4_remove_cld_pipe(net); err_shutdown: nfs4_cld_state_shutdown(net); return status; } static void nfsd4_cld_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); nfsd4_remove_cld_pipe(net); nfs4_cld_state_shutdown(net); } /* For older nfsdcld's */ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .init = nfsd4_init_cld_pipe, .exit = nfsd4_remove_cld_pipe, .create = nfsd4_cld_create, .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v0, .grace_done = nfsd4_cld_grace_done_v0, .version = 1, .msglen = sizeof(struct cld_msg), }; /* For newer nfsdcld's */ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .init = nfsd4_cld_tracking_init, .exit = nfsd4_cld_tracking_exit, .create = nfsd4_cld_create, .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, .version = 1, .msglen = sizeof(struct cld_msg), }; /* v2 create/check ops include the principal, if available */ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { .init = nfsd4_cld_tracking_init, .exit = nfsd4_cld_tracking_exit, .create = nfsd4_cld_create_v2, .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v2, .grace_done = nfsd4_cld_grace_done, .version = 2, .msglen = sizeof(struct cld_msg_v2), }; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING /* upcall via usermodehelper */ static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack"; module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog), S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program"); static bool cltrack_legacy_disable; module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(cltrack_legacy_disable, "Disable legacy recoverydir conversion. Default: false"); #define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" #define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" #define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION=" #define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START=" static char * nfsd4_cltrack_legacy_topdir(void) { int copied; size_t len; char *result; if (cltrack_legacy_disable) return NULL; len = strlen(LEGACY_TOPDIR_ENV_PREFIX) + strlen(nfs4_recoverydir()) + 1; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s", nfs4_recoverydir()); if (copied >= len) { /* just return nothing if output was truncated */ kfree(result); return NULL; } return result; } static char * nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) { int copied; size_t len; char *result; if (cltrack_legacy_disable) return NULL; /* +1 is for '/' between "topdir" and "recdir" */ len = strlen(LEGACY_RECDIR_ENV_PREFIX) + strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/", nfs4_recoverydir()); if (copied > (len - HEXDIR_LEN)) { /* just return nothing if output will be truncated */ kfree(result); return NULL; } copied = nfs4_make_rec_clidname(result + copied, name); if (copied) { kfree(result); return NULL; } return result; } static char * nfsd4_cltrack_client_has_session(struct nfs4_client *clp) { int copied; size_t len; char *result; /* prefix + Y/N character + terminating NULL */ len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c", clp->cl_minorversion ? 'Y' : 'N'); if (copied >= len) { /* just return nothing if output was truncated */ kfree(result); return NULL; } return result; } static char * nfsd4_cltrack_grace_start(time64_t grace_start) { int copied; size_t len; char *result; /* prefix + max width of int64_t string + terminating NULL */ len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1; result = kmalloc(len, GFP_KERNEL); if (!result) return result; copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%lld", grace_start); if (copied >= len) { /* just return nothing if output was truncated */ kfree(result); return NULL; } return result; } static int nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) { char *envp[3]; char *argv[4]; int ret; if (unlikely(!cltrack_prog[0])) { dprintk("%s: cltrack_prog is disabled\n", __func__); return -EACCES; } dprintk("%s: cmd: %s\n", __func__, cmd); dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)"); dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)"); envp[0] = env0; envp[1] = env1; envp[2] = NULL; argv[0] = (char *)cltrack_prog; argv[1] = cmd; argv[2] = arg; argv[3] = NULL; ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); /* * Disable the upcall mechanism if we're getting an ENOENT or EACCES * error. The admin can re-enable it on the fly by using sysfs * once the problem has been fixed. */ if (ret == -ENOENT || ret == -EACCES) { dprintk("NFSD: %s was not found or isn't executable (%d). " "Setting cltrack_prog to blank string!", cltrack_prog, ret); cltrack_prog[0] = '\0'; } dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret); return ret; } static char * bin_to_hex_dup(const unsigned char *src, int srclen) { char *buf; /* +1 for terminating NULL */ buf = kzalloc((srclen * 2) + 1, GFP_KERNEL); if (!buf) return buf; bin2hex(buf, src, srclen); return buf; } static int nfsd4_umh_cltrack_init(struct net *net) { int ret; struct nfsd_net *nn = net_generic(net, nfsd_net_id); char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time); /* XXX: The usermode helper s not working in container yet. */ if (net != &init_net) { pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n"); kfree(grace_start); return -EINVAL; } ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); kfree(grace_start); if (!ret) pr_info("NFSD: Using UMH upcall client tracking operations.\n"); return ret; } static void nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) { wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK, TASK_UNINTERRUPTIBLE); } static void nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) { clear_and_wake_up_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); } static void nfsd4_umh_cltrack_create(struct nfs4_client *clp) { char *hexid, *has_session, *grace_start; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); /* * With v4.0 clients, there's little difference in outcome between a * create and check operation, and we can end up calling into this * function multiple times per client (once for each openowner). So, * for v4.0 clients skip upcalling once the client has been recorded * on stable storage. * * For v4.1+ clients, the outcome of the two operations is different, * so we must ensure that we upcall for the create operation. v4.1+ * clients call this on RECLAIM_COMPLETE though, so we should only end * up doing a single create upcall per client. */ if (clp->cl_minorversion == 0 && test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } has_session = nfsd4_cltrack_client_has_session(clp); grace_start = nfsd4_cltrack_grace_start(nn->boot_time); nfsd4_cltrack_upcall_lock(clp); if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start)) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); nfsd4_cltrack_upcall_unlock(clp); kfree(has_session); kfree(grace_start); kfree(hexid); } static void nfsd4_umh_cltrack_remove(struct nfs4_client *clp) { char *hexid; if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } nfsd4_cltrack_upcall_lock(clp); if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) && nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0) clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); nfsd4_cltrack_upcall_unlock(clp); kfree(hexid); } static int nfsd4_umh_cltrack_check(struct nfs4_client *clp) { int ret; char *hexid, *has_session, *legacy; if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return -ENOMEM; } has_session = nfsd4_cltrack_client_has_session(clp); legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); nfsd4_cltrack_upcall_lock(clp); if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) { ret = 0; } else { ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy); if (ret == 0) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } nfsd4_cltrack_upcall_unlock(clp); kfree(has_session); kfree(legacy); kfree(hexid); return ret; } static void nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) { char *legacy; char timestr[22]; /* FIXME: better way to determine max size? */ sprintf(timestr, "%lld", nn->boot_time); legacy = nfsd4_cltrack_legacy_topdir(); nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); kfree(legacy); } static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .init = nfsd4_umh_cltrack_init, .exit = NULL, .create = nfsd4_umh_cltrack_create, .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, .version = 1, .msglen = 0, }; static inline int check_for_legacy_methods(int status, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct path path; /* * Next, try the UMH upcall. */ nn->client_tracking_ops = &nfsd4_umh_tracking_ops; status = nn->client_tracking_ops->init(net); if (!status) return status; /* * Finally, See if the recoverydir exists and is a directory. * If it is, then use the legacy ops. */ nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { status = !d_is_dir(path.dentry); path_put(&path); if (status) return -ENOTDIR; status = nn->client_tracking_ops->init(net); } return status; } #else static inline int check_for_legacy_methods(int status, struct net *net) { return status; } #endif /* CONFIG_LEGACY_NFSD_CLIENT_TRACKING */ int nfsd4_client_tracking_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int status; /* just run the init if it the method is already decided */ if (nn->client_tracking_ops) goto do_init; /* First, try to use nfsdcld */ nn->client_tracking_ops = &nfsd4_cld_tracking_ops; status = nn->client_tracking_ops->init(net); if (!status) return status; if (status != -ETIMEDOUT) { nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0; status = nn->client_tracking_ops->init(net); if (!status) return status; } status = check_for_legacy_methods(status, net); if (status) goto out; do_init: status = nn->client_tracking_ops->init(net); out: if (status) { pr_warn("NFSD: Unable to initialize client recovery tracking! (%d)\n", status); pr_warn("NFSD: Is nfsdcld running? If not, enable CONFIG_NFSD_LEGACY_CLIENT_TRACKING.\n"); nn->client_tracking_ops = NULL; } return status; } void nfsd4_client_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); if (nn->client_tracking_ops) { if (nn->client_tracking_ops->exit) nn->client_tracking_ops->exit(net); nn->client_tracking_ops = NULL; } } void nfsd4_client_record_create(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (nn->client_tracking_ops) nn->client_tracking_ops->create(clp); } void nfsd4_client_record_remove(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (nn->client_tracking_ops) nn->client_tracking_ops->remove(clp); } int nfsd4_client_record_check(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); if (nn->client_tracking_ops) return nn->client_tracking_ops->check(clp); return -EOPNOTSUPP; } void nfsd4_record_grace_done(struct nfsd_net *nn) { if (nn->client_tracking_ops) nn->client_tracking_ops->grace_done(nn); } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) return 0; if (!cn) { module_put(THIS_MODULE); return 0; } switch (event) { case RPC_PIPEFS_MOUNT: dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); break; } cn->cn_pipe->dentry = dentry; break; case RPC_PIPEFS_UMOUNT: if (cn->cn_pipe->dentry) nfsd4_cld_unregister_sb(cn->cn_pipe); break; default: ret = -ENOTSUPP; break; } module_put(THIS_MODULE); return ret; } static struct notifier_block nfsd4_cld_block = { .notifier_call = rpc_pipefs_event, }; int register_cld_notifier(void) { WARN_ON(!nfsd_net_id); return rpc_pipefs_notifier_register(&nfsd4_cld_block); } void unregister_cld_notifier(void) { rpc_pipefs_notifier_unregister(&nfsd4_cld_block); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_SWAB_H #define _UAPI_LINUX_SWAB_H #include <linux/types.h> #include <linux/stddef.h> #include <asm/bitsperlong.h> #include <asm/swab.h> /* * casts are necessary for constants, because we never know how for sure * how U/UL/ULL map to __u16, __u32, __u64. At least not in a portable way. */ #define ___constant_swab16(x) ((__u16)( \ (((__u16)(x) & (__u16)0x00ffU) << 8) | \ (((__u16)(x) & (__u16)0xff00U) >> 8))) #define ___constant_swab32(x) ((__u32)( \ (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \ (((__u32)(x) & (__u32)0x0000ff00UL) << 8) | \ (((__u32)(x) & (__u32)0x00ff0000UL) >> 8) | \ (((__u32)(x) & (__u32)0xff000000UL) >> 24))) #define ___constant_swab64(x) ((__u64)( \ (((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) | \ (((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) | \ (((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) | \ (((__u64)(x) & (__u64)0x00000000ff000000ULL) << 8) | \ (((__u64)(x) & (__u64)0x000000ff00000000ULL) >> 8) | \ (((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ (((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) | \ (((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56))) #define ___constant_swahw32(x) ((__u32)( \ (((__u32)(x) & (__u32)0x0000ffffUL) << 16) | \ (((__u32)(x) & (__u32)0xffff0000UL) >> 16))) #define ___constant_swahb32(x) ((__u32)( \ (((__u32)(x) & (__u32)0x00ff00ffUL) << 8) | \ (((__u32)(x) & (__u32)0xff00ff00UL) >> 8))) /* * Implement the following as inlines, but define the interface using * macros to allow constant folding when possible: * ___swab16, ___swab32, ___swab64, ___swahw32, ___swahb32 */ static inline __attribute_const__ __u16 __fswab16(__u16 val) { #if defined (__arch_swab16) return __arch_swab16(val); #else return ___constant_swab16(val); #endif } static inline __attribute_const__ __u32 __fswab32(__u32 val) { #if defined(__arch_swab32) return __arch_swab32(val); #else return ___constant_swab32(val); #endif } static inline __attribute_const__ __u64 __fswab64(__u64 val) { #if defined (__arch_swab64) return __arch_swab64(val); #elif defined(__SWAB_64_THRU_32__) __u32 h = val >> 32; __u32 l = val & ((1ULL << 32) - 1); return (((__u64)__fswab32(l)) << 32) | ((__u64)(__fswab32(h))); #else return ___constant_swab64(val); #endif } static inline __attribute_const__ __u32 __fswahw32(__u32 val) { #ifdef __arch_swahw32 return __arch_swahw32(val); #else return ___constant_swahw32(val); #endif } static inline __attribute_const__ __u32 __fswahb32(__u32 val) { #ifdef __arch_swahb32 return __arch_swahb32(val); #else return ___constant_swahb32(val); #endif } /** * __swab16 - return a byteswapped 16-bit value * @x: value to byteswap */ #ifdef __HAVE_BUILTIN_BSWAP16__ #define __swab16(x) (__u16)__builtin_bswap16((__u16)(x)) #else #define __swab16(x) \ (__u16)(__builtin_constant_p(x) ? \ ___constant_swab16(x) : \ __fswab16(x)) #endif /** * __swab32 - return a byteswapped 32-bit value * @x: value to byteswap */ #ifdef __HAVE_BUILTIN_BSWAP32__ #define __swab32(x) (__u32)__builtin_bswap32((__u32)(x)) #else #define __swab32(x) \ (__u32)(__builtin_constant_p(x) ? \ ___constant_swab32(x) : \ __fswab32(x)) #endif /** * __swab64 - return a byteswapped 64-bit value * @x: value to byteswap */ #ifdef __HAVE_BUILTIN_BSWAP64__ #define __swab64(x) (__u64)__builtin_bswap64((__u64)(x)) #else #define __swab64(x) \ (__u64)(__builtin_constant_p(x) ? \ ___constant_swab64(x) : \ __fswab64(x)) #endif static __always_inline unsigned long __swab(const unsigned long y) { #if __BITS_PER_LONG == 64 return __swab64(y); #else /* __BITS_PER_LONG == 32 */ return __swab32(y); #endif } /** * __swahw32 - return a word-swapped 32-bit value * @x: value to wordswap * * __swahw32(0x12340000) is 0x00001234 */ #define __swahw32(x) \ (__builtin_constant_p((__u32)(x)) ? \ ___constant_swahw32(x) : \ __fswahw32(x)) /** * __swahb32 - return a high and low byte-swapped 32-bit value * @x: value to byteswap * * __swahb32(0x12345678) is 0x34127856 */ #define __swahb32(x) \ (__builtin_constant_p((__u32)(x)) ? \ ___constant_swahb32(x) : \ __fswahb32(x)) /** * __swab16p - return a byteswapped 16-bit value from a pointer * @p: pointer to a naturally-aligned 16-bit value */ static __always_inline __u16 __swab16p(const __u16 *p) { #ifdef __arch_swab16p return __arch_swab16p(p); #else return __swab16(*p); #endif } /** * __swab32p - return a byteswapped 32-bit value from a pointer * @p: pointer to a naturally-aligned 32-bit value */ static __always_inline __u32 __swab32p(const __u32 *p) { #ifdef __arch_swab32p return __arch_swab32p(p); #else return __swab32(*p); #endif } /** * __swab64p - return a byteswapped 64-bit value from a pointer * @p: pointer to a naturally-aligned 64-bit value */ static __always_inline __u64 __swab64p(const __u64 *p) { #ifdef __arch_swab64p return __arch_swab64p(p); #else return __swab64(*p); #endif } /** * __swahw32p - return a wordswapped 32-bit value from a pointer * @p: pointer to a naturally-aligned 32-bit value * * See __swahw32() for details of wordswapping. */ static inline __u32 __swahw32p(const __u32 *p) { #ifdef __arch_swahw32p return __arch_swahw32p(p); #else return __swahw32(*p); #endif } /** * __swahb32p - return a high and low byteswapped 32-bit value from a pointer * @p: pointer to a naturally-aligned 32-bit value * * See __swahb32() for details of high/low byteswapping. */ static inline __u32 __swahb32p(const __u32 *p) { #ifdef __arch_swahb32p return __arch_swahb32p(p); #else return __swahb32(*p); #endif } /** * __swab16s - byteswap a 16-bit value in-place * @p: pointer to a naturally-aligned 16-bit value */ static inline void __swab16s(__u16 *p) { #ifdef __arch_swab16s __arch_swab16s(p); #else *p = __swab16p(p); #endif } /** * __swab32s - byteswap a 32-bit value in-place * @p: pointer to a naturally-aligned 32-bit value */ static __always_inline void __swab32s(__u32 *p) { #ifdef __arch_swab32s __arch_swab32s(p); #else *p = __swab32p(p); #endif } /** * __swab64s - byteswap a 64-bit value in-place * @p: pointer to a naturally-aligned 64-bit value */ static __always_inline void __swab64s(__u64 *p) { #ifdef __arch_swab64s __arch_swab64s(p); #else *p = __swab64p(p); #endif } /** * __swahw32s - wordswap a 32-bit value in-place * @p: pointer to a naturally-aligned 32-bit value * * See __swahw32() for details of wordswapping */ static inline void __swahw32s(__u32 *p) { #ifdef __arch_swahw32s __arch_swahw32s(p); #else *p = __swahw32p(p); #endif } /** * __swahb32s - high and low byteswap a 32-bit value in-place * @p: pointer to a naturally-aligned 32-bit value * * See __swahb32() for details of high and low byte swapping */ static inline void __swahb32s(__u32 *p) { #ifdef __arch_swahb32s __arch_swahb32s(p); #else *p = __swahb32p(p); #endif } #endif /* _UAPI_LINUX_SWAB_H */
8 8 8 5 5 1 1 1 1 1 4 4 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2018 Facebook */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/sock_diag.h> #include <net/sock_reuseport.h> #include <linux/btf_ids.h> struct reuseport_array { struct bpf_map map; struct sock __rcu *ptrs[]; }; static struct reuseport_array *reuseport_array(struct bpf_map *map) { return (struct reuseport_array *)map; } /* The caller must hold the reuseport_lock */ void bpf_sk_reuseport_detach(struct sock *sk) { struct sock __rcu **socks; write_lock_bh(&sk->sk_callback_lock); socks = __locked_read_sk_user_data_with_flags(sk, SK_USER_DATA_BPF); if (socks) { WRITE_ONCE(sk->sk_user_data, NULL); /* * Do not move this NULL assignment outside of * sk->sk_callback_lock because there is * a race with reuseport_array_free() * which does not hold the reuseport_lock. */ RCU_INIT_POINTER(*socks, NULL); } write_unlock_bh(&sk->sk_callback_lock); } static int reuseport_array_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32) && attr->value_size != sizeof(u64)) return -EINVAL; return array_map_alloc_check(attr); } static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; if (unlikely(index >= array->map.max_entries)) return NULL; return rcu_dereference(array->ptrs[index]); } /* Called from syscall only */ static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; struct sock *sk; int err; if (index >= map->max_entries) return -E2BIG; if (!rcu_access_pointer(array->ptrs[index])) return -ENOENT; spin_lock_bh(&reuseport_lock); sk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); if (sk) { write_lock_bh(&sk->sk_callback_lock); WRITE_ONCE(sk->sk_user_data, NULL); RCU_INIT_POINTER(array->ptrs[index], NULL); write_unlock_bh(&sk->sk_callback_lock); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&reuseport_lock); return err; } static void reuseport_array_free(struct bpf_map *map) { struct reuseport_array *array = reuseport_array(map); struct sock *sk; u32 i; /* * ops->map_*_elem() will not be able to access this * array now. Hence, this function only races with * bpf_sk_reuseport_detach() which was triggered by * close() or disconnect(). * * This function and bpf_sk_reuseport_detach() are * both removing sk from "array". Who removes it * first does not matter. * * The only concern here is bpf_sk_reuseport_detach() * may access "array" which is being freed here. * bpf_sk_reuseport_detach() access this "array" * through sk->sk_user_data _and_ with sk->sk_callback_lock * held which is enough because this "array" is not freed * until all sk->sk_user_data has stopped referencing this "array". * * Hence, due to the above, taking "reuseport_lock" is not * needed here. */ /* * Since reuseport_lock is not taken, sk is accessed under * rcu_read_lock() */ rcu_read_lock(); for (i = 0; i < map->max_entries; i++) { sk = rcu_dereference(array->ptrs[i]); if (sk) { write_lock_bh(&sk->sk_callback_lock); /* * No need for WRITE_ONCE(). At this point, * no one is reading it without taking the * sk->sk_callback_lock. */ sk->sk_user_data = NULL; write_unlock_bh(&sk->sk_callback_lock); RCU_INIT_POINTER(array->ptrs[i], NULL); } } rcu_read_unlock(); /* * Once reaching here, all sk->sk_user_data is not * referencing this "array". "array" can be freed now. */ bpf_map_area_free(array); } static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); return &array->map; } int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { struct sock *sk; int err; if (map->value_size != sizeof(u64)) return -ENOSPC; rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; } rcu_read_unlock(); return err; } static int reuseport_array_update_check(const struct reuseport_array *array, const struct sock *nsk, const struct sock *osk, const struct sock_reuseport *nsk_reuse, u32 map_flags) { if (osk && map_flags == BPF_NOEXIST) return -EEXIST; if (!osk && map_flags == BPF_EXIST) return -ENOENT; if (nsk->sk_protocol != IPPROTO_UDP && nsk->sk_protocol != IPPROTO_TCP) return -ENOTSUPP; if (nsk->sk_family != AF_INET && nsk->sk_family != AF_INET6) return -ENOTSUPP; if (nsk->sk_type != SOCK_STREAM && nsk->sk_type != SOCK_DGRAM) return -ENOTSUPP; /* * sk must be hashed (i.e. listening in the TCP case or binded * in the UDP case) and * it must also be a SO_REUSEPORT sk (i.e. reuse cannot be NULL). * * Also, sk will be used in bpf helper that is protected by * rcu_read_lock(). */ if (!sock_flag(nsk, SOCK_RCU_FREE) || !sk_hashed(nsk) || !nsk_reuse) return -EINVAL; /* READ_ONCE because the sk->sk_callback_lock may not be held here */ if (READ_ONCE(nsk->sk_user_data)) return -EBUSY; return 0; } /* * Called from syscall only. * The "nsk" in the fd refcnt. * The "osk" and "reuse" are protected by reuseport_lock. */ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct reuseport_array *array = reuseport_array(map); struct sock *free_osk = NULL, *osk, *nsk; struct sock_reuseport *reuse; u32 index = *(u32 *)key; uintptr_t sk_user_data; struct socket *socket; int err, fd; if (map_flags > BPF_EXIST) return -EINVAL; if (index >= map->max_entries) return -E2BIG; if (map->value_size == sizeof(u64)) { u64 fd64 = *(u64 *)value; if (fd64 > S32_MAX) return -EINVAL; fd = fd64; } else { fd = *(int *)value; } socket = sockfd_lookup(fd, &err); if (!socket) return err; nsk = socket->sk; if (!nsk) { err = -EINVAL; goto put_file; } /* Quick checks before taking reuseport_lock */ err = reuseport_array_update_check(array, nsk, rcu_access_pointer(array->ptrs[index]), rcu_access_pointer(nsk->sk_reuseport_cb), map_flags); if (err) goto put_file; spin_lock_bh(&reuseport_lock); /* * Some of the checks only need reuseport_lock * but it is done under sk_callback_lock also * for simplicity reason. */ write_lock_bh(&nsk->sk_callback_lock); osk = rcu_dereference_protected(array->ptrs[index], lockdep_is_held(&reuseport_lock)); reuse = rcu_dereference_protected(nsk->sk_reuseport_cb, lockdep_is_held(&reuseport_lock)); err = reuseport_array_update_check(array, nsk, osk, reuse, map_flags); if (err) goto put_file_unlock; sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; err = 0; put_file_unlock: write_unlock_bh(&nsk->sk_callback_lock); if (free_osk) { write_lock_bh(&free_osk->sk_callback_lock); WRITE_ONCE(free_osk->sk_user_data, NULL); write_unlock_bh(&free_osk->sk_callback_lock); } spin_unlock_bh(&reuseport_lock); put_file: sockfd_put(socket); return err; } /* Called from syscall */ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct reuseport_array *array = reuseport_array(map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { *next = 0; return 0; } if (index == array->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } static u64 reuseport_array_mem_usage(const struct bpf_map *map) { struct reuseport_array *array; return struct_size(array, ptrs, map->max_entries); } BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array) const struct bpf_map_ops reuseport_array_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, .map_lookup_elem = reuseport_array_lookup_elem, .map_get_next_key = reuseport_array_get_next_key, .map_delete_elem = reuseport_array_delete_elem, .map_mem_usage = reuseport_array_mem_usage, .map_btf_id = &reuseport_array_map_btf_ids[0], };
110 116 116 116 5 5 5 5 5 5 5 5 116 5 5 116 111 111 29 30 3 30 31 29 29 29 29 20 32 32 31 31 19 20 29 18 42 31 42 5 5 3 3 3 5 5 5 5 3 3 3 3 3 3 3 49 50 50 50 50 118 145 50 110 111 110 109 110 10 10 3 3 3 3 3 5 5 5 5 5 3 3 3 38 111 99 99 99 99 32 31 31 31 1 3 3 3 3 3 5 5 5 5 5 3 3 3 38 111 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dev_addr_lists.c - Functions for handling net device lists * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> * * This file contains functions for working with unicast, multicast and device * addresses lists. */ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/export.h> #include <linux/list.h> #include "dev.h" /* * General list handling functions */ static int __hw_addr_insert(struct netdev_hw_addr_list *list, struct netdev_hw_addr *new, int addr_len) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(new->addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&new->type, &ha->type, sizeof(new->type)); parent = *ins_point; if (diff < 0) ins_point = &parent->rb_left; else if (diff > 0) ins_point = &parent->rb_right; else return -EEXIST; } rb_link_node_rcu(&new->node, parent, ins_point); rb_insert_color(&new->node, &list->tree); return 0; } static struct netdev_hw_addr* __hw_addr_create(const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, int sync_count, bool exclusive) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); parent = *ins_point; if (diff < 0) { ins_point = &parent->rb_left; } else if (diff > 0) { ins_point = &parent->rb_right; } else { if (exclusive) return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) return 0; else ha->global_use = true; } if (sync) { if (ha->synced && sync_count) return -EEXIST; else ha->synced++; } ha->refcount++; return 0; } } ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); if (!ha) return -ENOMEM; rb_link_node(&ha->node, parent, ins_point); rb_insert_color(&ha->node, &list->tree); list_add_tail_rcu(&ha->list, &list->list); list->count++; return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, struct netdev_hw_addr *ha, bool global, bool sync) { if (global && !ha->global_use) return -ENOENT; if (sync && !ha->synced) return -ENOENT; if (global) ha->global_use = false; if (sync) ha->synced--; if (--ha->refcount) return 0; rb_erase(&ha->node, &list->tree); list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { struct rb_node *node; node = list->tree.rb_node; while (node) { struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); int diff = memcmp(addr, ha->addr, addr_len); if (diff == 0 && addr_type) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); if (diff < 0) node = node->rb_left; else if (diff > 0) node = node->rb_right; else return ha; } return NULL; } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); if (!ha) return -ENOENT; return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); } static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; if (!err) { ha->sync_cnt++; ha->refcount++; } return 0; } static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, false, true); if (err) return; ha->sync_cnt--; /* address on from list is not marked synced */ __hw_addr_del_entry(from_list, ha, false, false); } static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt == ha->refcount) { __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } else { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } } return err; } /* This function only works where there is a strict 1-1 relationship * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (!ha->sync_cnt) { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } else if (ha->refcount == 1) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } return err; } EXPORT_SYMBOL(__hw_addr_sync); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } } EXPORT_SYMBOL(__hw_addr_unsync); /** * __hw_addr_sync_dev - Synchronize device's multicast list * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without * any notification to the device. **/ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; int err; /* first go through and flush out any stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt || ha->refcount != 1) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (ha->sync_cnt) continue; err = sync(dev, ha->addr); if (err) return err; ha->sync_cnt++; ha->refcount++; } return 0; } EXPORT_SYMBOL(__hw_addr_sync_dev); /** * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking * into account references * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address or reference on it should be added * @unsync: function to call if address or some reference on it should removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address or references on it * add/remove notifications. The unsync function may be NULL in which case * the addresses or references on it requiring removal will simply be * removed without any notification to the device. That is responsibility of * the driver to identify and distribute address or references on it between * internal address tables. **/ int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; int err, ref_cnt; /* first go through and flush out any unsynced/stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address is not used */ if ((ha->sync_cnt << 1) <= ha->refcount) continue; /* if fails defer unsyncing address */ ref_cnt = ha->refcount - ha->sync_cnt; if (unsync && unsync(dev, ha->addr, ref_cnt)) continue; ha->refcount = (ref_cnt << 1) + 1; ha->sync_cnt = ref_cnt; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync updated/new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address added or reused */ if ((ha->sync_cnt << 1) >= ha->refcount) continue; ref_cnt = ha->refcount - ha->sync_cnt; err = sync(dev, ha->addr, ref_cnt); if (err) return err; ha->refcount = ref_cnt << 1; ha->sync_cnt = ref_cnt; } return 0; } EXPORT_SYMBOL(__hw_addr_ref_sync_dev); /** * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on * it from device * @list: address list to remove synchronized addresses (references on it) from * @dev: device to sync * @unsync: function to call if address and references on it should be removed * * Remove all addresses that were added to the device by * __hw_addr_ref_sync_dev(). This function is intended to be called from the * ndo_stop or ndo_open functions on devices that require explicit address (or * references on it) add/remove notifications. If the unsync function pointer * is NULL then this function can be used to just reset the sync_cnt for the * addresses in the list. **/ void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) continue; ha->refcount -= ha->sync_cnt - 1; ha->sync_cnt = 0; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); /** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by __hw_addr_sync_dev(). * This function is intended to be called from the ndo_stop or ndo_open * functions on devices that require explicit address add/remove * notifications. If the unsync function pointer is NULL then this function * can be used to just reset the sync_cnt for the addresses in the list. **/ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_unsync_dev); static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); } list->count = 0; } void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); /* * Device addresses handling functions */ /* Check that netdev->dev_addr is not written to directly as this would * break the rbtree layout. All changes should go thru dev_addr_set() and co. * Remove this check in mid-2024. */ void dev_addr_check(struct net_device *dev) { if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) return; netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); netdev_warn(dev, "Expected addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr_shadow); netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); } /** * dev_addr_flush - Flush device address list * @dev: device * * Flush device address list and reset ->dev_addr. * * The caller must hold the rtnl_mutex. */ void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } /** * dev_addr_init - Init device address list * @dev: device * * Init device address list and create the first element, * used by ->dev_addr. * * The caller must hold the rtnl_mutex. */ int dev_addr_init(struct net_device *dev) { unsigned char addr[MAX_ADDR_LEN]; struct netdev_hw_addr *ha; int err; /* rtnl_mutex must be held here */ __hw_addr_init(&dev->dev_addrs); memset(addr, 0, sizeof(addr)); err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* * Get the first (previously created) address from the list * and set dev_addr pointer to this location. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); dev->dev_addr = ha->addr; } return err; } void dev_addr_mod(struct net_device *dev, unsigned int offset, const void *addr, size_t len) { struct netdev_hw_addr *ha; dev_addr_check(dev); ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); rb_erase(&ha->node, &dev->dev_addrs.tree); memcpy(&ha->addr[offset], addr, len); memcpy(&dev->dev_addr_shadow[offset], addr, len); WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); } EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address * @dev: device * @addr: address to add * @addr_type: address type * * Add a device address to the device or increase the reference count if * it already exists. * * The caller must hold the rtnl_mutex. */ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; ASSERT_RTNL(); err = dev_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_add); /** * dev_addr_del - Release a device address. * @dev: device * @addr: address to delete * @addr_type: address type * * Release reference to a device address and remove it from the device * if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; struct netdev_hw_addr *ha; ASSERT_RTNL(); /* * We can not remove the first address from the list because * dev->dev_addr points to that. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); if (!memcmp(ha->addr, addr, dev->addr_len) && ha->type == addr_type && ha->refcount == 1) return -ENOENT; err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_del); /* * Unicast list handling functions */ /** * dev_uc_add_excl - Add a global secondary unicast address * @dev: device * @addr: address to add */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add_excl); /** * dev_uc_add - Add a secondary unicast address * @dev: device * @addr: address to add * * Add a secondary unicast address to the device or increase * the reference count if it already exists. */ int dev_uc_add(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add); /** * dev_uc_del - Release secondary unicast address. * @dev: device * @addr: address to delete * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. */ int dev_uc_del(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_del); /** * dev_uc_sync - Synchronize device's unicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. This function assumes that * addresses will only ever be synced to the @to devices and no other. */ int dev_uc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync); /** * dev_uc_sync_multiple - Synchronize device's unicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have been deleted from the source. The source device * must be locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. It allows for a single source * device to be synced to multiple destination devices. */ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync_multiple); /** * dev_uc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_uc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_uc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two * reasons: * 1) This is always called without any addr_list_lock, so as the * outermost one here, it must be 0. * 2) This is called by some callers after unlinking the upper device, * so the dev->lower_level becomes 1 again. * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or * larger. */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_uc_unsync); /** * dev_uc_flush - Flush unicast addresses * @dev: device * * Flush unicast addresses. */ void dev_uc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->uc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_uc_flush); /** * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. */ void dev_uc_init(struct net_device *dev) { __hw_addr_init(&dev->uc); } EXPORT_SYMBOL(dev_uc_init); /* * Multicast list handling functions */ /** * dev_mc_add_excl - Add a global secondary multicast address * @dev: device * @addr: address to add */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_mc_add_excl); static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false, 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_add - Add a multicast address * @dev: device * @addr: address to add * * Add a multicast address to the device or increase * the reference count if it already exists. */ int dev_mc_add(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, false); } EXPORT_SYMBOL(dev_mc_add); /** * dev_mc_add_global - Add a global multicast address * @dev: device * @addr: address to add * * Add a global multicast address to the device. */ int dev_mc_add_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, true); } EXPORT_SYMBOL(dev_mc_add_global); static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_del - Delete a multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, false); } EXPORT_SYMBOL(dev_mc_del); /** * dev_mc_del_global - Delete a global multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, true); } EXPORT_SYMBOL(dev_mc_del_global); /** * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. */ int dev_mc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync); /** * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. It allows for a single * source device to be synced to multiple destination devices. */ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync_multiple); /** * dev_mc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_mc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_mc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_mc_unsync); /** * dev_mc_flush - Flush multicast addresses * @dev: device * * Flush multicast addresses. */ void dev_mc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->mc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_mc_flush); /** * dev_mc_init - Init multicast address list * @dev: device * * Init multicast address list. */ void dev_mc_init(struct net_device *dev) { __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init);
4 3 4 3 4 4 2 4 4 3 4 3 4 4 4 4 4 2 4 4 5 1 4 1 10 9 7 7 7 6 2 1 4 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "tctx.h" #include "poll.h" #include "timeout.h" #include "waitid.h" #include "futex.h" #include "cancel.h" struct io_cancel { struct file *file; u64 addr; u32 flags; s32 fd; u8 opcode; }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) /* * Returns true if the request matches the criteria outlined by 'cd'. */ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) { bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; if (req->ctx != cd->ctx) return false; if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) match_user_data = true; if (cd->flags & IORING_ASYNC_CANCEL_ANY) goto check_seq; if (cd->flags & IORING_ASYNC_CANCEL_FD) { if (req->file != cd->file) return false; } if (cd->flags & IORING_ASYNC_CANCEL_OP) { if (req->opcode != cd->opcode) return false; } if (match_user_data && req->cqe.user_data != cd->data) return false; if (cd->flags & IORING_ASYNC_CANCEL_ALL) { check_seq: if (io_cancel_match_sequence(req, cd->seq)) return false; } return true; } static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; return io_cancel_req_match(req, cd); } static int io_async_cancel_one(struct io_uring_task *tctx, struct io_cancel_data *cd) { enum io_wq_cancel cancel_ret; int ret = 0; bool all; if (!tctx || !tctx->io_wq) return -ENOENT; all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; break; case IO_WQ_CANCEL_RUNNING: ret = -EALREADY; break; case IO_WQ_CANCEL_NOTFOUND: ret = -ENOENT; break; } return ret; } int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { struct io_ring_ctx *ctx = cd->ctx; int ret; WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. */ if (!ret) return 0; ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_waitid_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; ret = io_futex_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); return ret; } int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); cancel->flags = READ_ONCE(sqe->cancel_flags); if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; if (cancel->flags & IORING_ASYNC_CANCEL_FD) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } if (cancel->flags & IORING_ASYNC_CANCEL_OP) { if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; cancel->opcode = READ_ONCE(sqe->len); } return 0; } static int __io_async_cancel(struct io_cancel_data *cd, struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); struct io_ring_ctx *ctx = cd->ctx; struct io_tctx_node *node; int ret, nr = 0; do { ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) return ret; nr++; } while (1); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; nr++; } } io_ring_submit_unlock(ctx, issue_flags); return all ? nr : ret; } int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_cancel *cancel = io_kiocb_to_cmd(req, struct io_cancel); struct io_cancel_data cd = { .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->tctx; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE || cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) { req->flags |= REQ_F_FIXED_FILE; req->file = io_file_get_fixed(req, cancel->fd, issue_flags); } else { req->file = io_file_get_normal(req, cancel->fd); } if (!req->file) { ret = -EBADF; goto done; } cd.file = req->file; } ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } static int __io_sync_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int fd) { struct io_ring_ctx *ctx = cd->ctx; /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { struct io_rsrc_node *node; node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (unlikely(!node)) return -EBADF; cd->file = io_slot_file(node); if (!cd->file) return -EBADF; } return __io_async_cancel(cd, tctx, 0); } int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) __must_hold(&ctx->uring_lock) { struct io_cancel_data cd = { .ctx = ctx, .seq = atomic_inc_return(&ctx->cancel_seq), }; ktime_t timeout = KTIME_MAX; struct io_uring_sync_cancel_reg sc; struct file *file = NULL; DEFINE_WAIT(wait); int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad); i++) if (sc.pad[i]) return -EINVAL; for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) if (sc.pad2[i]) return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if ((cd.flags & IORING_ASYNC_CANCEL_FD) && !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) { file = fget(sc.fd); if (!file) return -EBADF; cd.file = file; } ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); /* found something, done! */ if (ret != -EALREADY) goto out; if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) { struct timespec64 ts = { .tv_sec = sc.timeout.tv_sec, .tv_nsec = sc.timeout.tv_nsec }; timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); } /* * Keep looking until we get -ENOENT. we'll get woken everytime * every time a request completes and will retry the cancelation. */ do { cd.seq = atomic_inc_return(&ctx->cancel_seq); prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); mutex_unlock(&ctx->uring_lock); if (ret != -EALREADY) break; ret = io_run_task_work_sig(ctx); if (ret < 0) break; ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); if (!ret) { ret = -ETIME; break; } mutex_lock(&ctx->uring_lock); } while (1); finish_wait(&ctx->cq_wait, &wait); mutex_lock(&ctx->uring_lock); if (ret == -ENOENT || ret > 0) ret = 0; out: if (file) fput(file); return ret; }
2 2 2 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2021 NXP */ #include "netlink.h" #include "common.h" struct phc_vclocks_req_info { struct ethnl_req_info base; }; struct phc_vclocks_reply_data { struct ethnl_reply_data base; int num; int *index; }; #define PHC_VCLOCKS_REPDATA(__reply_base) \ container_of(__reply_base, struct phc_vclocks_reply_data, base) const struct nla_policy ethnl_phc_vclocks_get_policy[] = { [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->num = ethtool_get_phc_vclocks(dev, &data->index); ethnl_ops_complete(dev); return ret; } static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); int len = 0; if (data->num > 0) { len += nla_total_size(sizeof(u32)); len += nla_total_size(sizeof(s32) * data->num); } return len; } static int phc_vclocks_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); if (data->num <= 0) return 0; if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, sizeof(s32) * data->num, data->index)) return -EMSGSIZE; return 0; } static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) { const struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); kfree(data->index); } const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, .req_info_size = sizeof(struct phc_vclocks_req_info), .reply_data_size = sizeof(struct phc_vclocks_reply_data), .prepare_data = phc_vclocks_prepare_data, .reply_size = phc_vclocks_reply_size, .fill_reply = phc_vclocks_fill_reply, .cleanup_data = phc_vclocks_cleanup_data, };
3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/mballoc.h * * Written by: Alex Tomas <alex@clusterfs.com> * */ #ifndef _EXT4_MBALLOC_H #define _EXT4_MBALLOC_H #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/proc_fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" /* * mb_debug() dynamic printk msgs could be used to debug mballoc code. */ #ifdef CONFIG_EXT4_DEBUG #define mb_debug(sb, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ current->comm, task_pid_nr(current), sb->s_id, \ __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else #define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ /* * How long mballoc can look for a best extent (in found extents) */ #define MB_DEFAULT_MAX_TO_SCAN 200 /* * How long mballoc must look for a best extent */ #define MB_DEFAULT_MIN_TO_SCAN 10 /* * with 's_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ #define MB_DEFAULT_STATS 0 /* * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served * by the stream allocator, which purpose is to pack requests * as close each to other as possible to produce smooth I/O traffic * We use locality group prealloc space for stream request. * We can tune the same via /proc/fs/ext4/<partition>/stream_req */ #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ /* * for which requests use 2^N search using buddies */ #define MB_DEFAULT_ORDER2_REQS 2 /* * default group prealloc size 512 blocks */ #define MB_DEFAULT_GROUP_PREALLOC 512 /* * Number of groups to search linearly before performing group scanning * optimization. */ #define MB_DEFAULT_LINEAR_LIMIT 4 /* * Minimum number of groups that should be present in the file system to perform * group scanning optimizations. */ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 /* * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular * allocation request. Example, if we have an order 7 request and max trim order * of 3, we can trim this request upto order 4. */ #define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 /* * Number of valid buddy orders */ #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; /* group which free block extent belongs */ ext4_group_t efd_group; /* free block extent */ ext4_grpblk_t efd_start_cluster; ext4_grpblk_t efd_count; /* transaction which freed this extent */ tid_t efd_tid; }; struct ext4_prealloc_space { union { struct rb_node inode_node; /* for inode PA rbtree */ struct list_head lg_list; /* for lg PAs */ } pa_node; struct list_head pa_group_list; union { struct list_head pa_tmp_list; struct rcu_head pa_rcu; } u; spinlock_t pa_lock; atomic_t pa_count; unsigned pa_deleted; ext4_fsblk_t pa_pstart; /* phys. block */ ext4_lblk_t pa_lstart; /* log. block */ ext4_grpblk_t pa_len; /* len of preallocated chunk */ ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ union { rwlock_t *inode_lock; /* locks the rbtree holding this PA */ spinlock_t *lg_lock; /* locks the lg list holding this PA */ } pa_node_lock; struct inode *pa_inode; /* used to get the inode during group discard */ }; enum { MB_INODE_PA = 0, MB_GROUP_PA = 1 }; struct ext4_free_extent { ext4_lblk_t fe_logical; ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; ext4_grpblk_t fe_len; /* In cluster units */ }; /* * Locality group: * we try to group all related changes together * so that writeback can flush/allocate them together as well * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC * (512). We store prealloc space into the hash based on the pa_free blocks * order value.ie, fls(pa_free)-1; */ #define PREALLOC_TB_SIZE 10 struct ext4_locality_group { /* for allocator */ /* to serialize allocates */ struct mutex lg_mutex; /* list of preallocations */ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; spinlock_t lg_prealloc_lock; }; struct ext4_allocation_context { struct inode *ac_inode; struct super_block *ac_sb; /* original request */ struct ext4_free_extent ac_o_ex; /* goal request (normalized ac_o_ex) */ struct ext4_free_extent ac_g_ex; /* the best found extent */ struct ext4_free_extent ac_b_ex; /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; /* * goal len can change in CR_BEST_AVAIL_LEN, so save the original len. * This is used while adjusting the PA window and for accounting. */ ext4_grpblk_t ac_orig_goal_len; __u32 ac_flags; /* allocation hints */ __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; __u16 ac_buddy; __u8 ac_status; __u8 ac_criteria; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ struct folio *ac_bitmap_folio; struct folio *ac_buddy_folio; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; #define AC_STATUS_CONTINUE 1 #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 struct ext4_buddy { struct folio *bd_buddy_folio; void *bd_buddy; struct folio *bd_bitmap_folio; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; }; static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { return ext4_group_first_block_no(sb, fex->fe_group) + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, struct ext4_free_extent *fex) { /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); } static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, struct ext4_prealloc_space *pa) { /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); } typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t len, void *priv); int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv); #endif
665 263 636 667 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0 */ /* * x86 KFENCE support. * * Copyright (C) 2020, Google LLC. */ #ifndef _ASM_X86_KFENCE_H #define _ASM_X86_KFENCE_H #ifndef MODULE #include <linux/bug.h> #include <linux/kfence.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/tlbflush.h> /* Force 4K pages for __kfence_pool. */ static inline bool arch_kfence_init_pool(void) { unsigned long addr; for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); addr += PAGE_SIZE) { unsigned int level; if (!lookup_address(addr, &level)) return false; if (level != PG_LEVEL_4K) set_memory_4k(addr, 1); } return true; } /* Protect the given page and flush TLB. */ static inline bool kfence_protect_page(unsigned long addr, bool protect) { unsigned int level; pte_t *pte = lookup_address(addr, &level); if (WARN_ON(!pte || level != PG_LEVEL_4K)) return false; /* * We need to avoid IPIs, as we may get KFENCE allocations or faults * with interrupts disabled. Therefore, the below is best-effort, and * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; * lazy fault handling takes care of faults after the page is PRESENT. */ if (protect) set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); /* * Flush this CPU's TLB, assuming whoever did the allocation/free is * likely to continue running on this CPU. */ preempt_disable(); flush_tlb_one_kernel(addr); preempt_enable(); return true; } #endif /* !MODULE */ #endif /* _ASM_X86_KFENCE_H */
94 94 32 94 32 32 32 94 94 16 16 16 497 488 484 488 486 486 481 500 32 16 16 16 127 126 126 94 94 94 16 16 16 16 16 264 266 264 265 218 5 5 5 5 4 5 5 5 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> #define CREATE_TRACE_POINTS #include <trace/events/notifier.h> /* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. */ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); /* * Notifier chain core routines. The exported routines below * are layered on top of these, with appropriate locking added. */ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n, bool unique_priority) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { WARN(1, "notifier callback %ps already registered", n->notifier_call); return -EEXIST; } if (n->priority > (*nl)->priority) break; if (n->priority == (*nl)->priority && unique_priority) return -EBUSY; nl = &((*nl)->next); } n->next = *nl; rcu_assign_pointer(*nl, n); trace_notifier_register((void *)n->notifier_call); return 0; } static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { if ((*nl) == n) { rcu_assign_pointer(*nl, n->next); trace_notifier_unregister((void *)n->notifier_call); return 0; } nl = &((*nl)->next); } return -ENOENT; } /** * notifier_call_chain - Informs the registered notifiers about an event. * @nl: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * @nr_to_call: Number of notifier functions to be called. Don't care * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. * Return: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v, int nr_to_call, int *nr_calls) { int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; } #endif trace_notifier_run((void *)nb->notifier_call); ret = nb->notifier_call(nb, val, v); if (nr_calls) (*nr_calls)++; if (ret & NOTIFY_STOP_MASK) break; nb = next_nb; nr_to_call--; } return ret; } NOKPROBE_SYMBOL(notifier_call_chain); /** * notifier_call_chain_robust - Inform the registered notifiers about an event * and rollback on error. * @nl: Pointer to head of the blocking notifier chain * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up * @v: Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * * Return: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, void *v) { int ret, nr = 0; ret = notifier_call_chain(nl, val_up, v, -1, &nr); if (ret & NOTIFY_STOP_MASK) notifier_call_chain(nl, val_down, v, nr-1, NULL); return ret; } /* * Atomic notifier chain routines. Registration and unregistration * use a spinlock, and call_chain is synchronized by RCU (no locks). */ /** * atomic_notifier_chain_register - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain. * * Returns 0 on success, %-EEXIST on error. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, false); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); /** * atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * * Adds a notifier to an atomic notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n, true); spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio); /** * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an atomic notifier chain. * * Returns zero on success or %-ENOENT on failure. */ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *n) { unsigned long flags; int ret; spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); /** * atomic_notifier_call_chain - Call functions in an atomic notifier chain * @nh: Pointer to head of the atomic notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an atomic context, so they must not block. * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; rcu_read_lock(); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); NOKPROBE_SYMBOL(atomic_notifier_call_chain); /** * atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty * @nh: Pointer to head of the atomic notifier chain * * Checks whether notifier chain is empty. * * Returns true is notifier chain is empty, false otherwise. */ bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh) { return !rcu_access_pointer(nh->head); } /* * Blocking notifier chain routines. All access to the chain is * synchronized by an rwsem. */ static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n, bool unique_priority) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, unique_priority); down_write(&nh->rwsem); ret = notifier_chain_register(&nh->head, n, unique_priority); up_write(&nh->rwsem); return ret; } /** * blocking_notifier_chain_register - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to a blocking notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, false); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** * blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * * Adds a notifier to an blocking notifier chain if there is no other * notifier registered using the same priority. * * Returns 0 on success, %-EEXIST or %-EBUSY on error. */ int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh, struct notifier_block *n) { return __blocking_notifier_chain_register(nh, n, true); } EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio); /** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a blocking notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call down_write(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); down_write(&nh->rwsem); ret = notifier_chain_unregister(&nh->head, n); up_write(&nh->rwsem); return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust); /** * blocking_notifier_call_chain - Call functions in a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v) { int ret = NOTIFY_DONE; /* * We check the head outside the lock, but if this access is * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); up_read(&nh->rwsem); } return ret; } EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); /* * Raw notifier chain routines. There is no protection; * the caller must provide it. Use at your own risk! */ /** * raw_notifier_chain_register - Add notifier to a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: New entry in notifier chain * * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * * Returns 0 on success, %-EEXIST on error. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_register(&nh->head, n, false); } EXPORT_SYMBOL_GPL(raw_notifier_chain_register); /** * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from a raw notifier chain. * All locking must be provided by the caller. * * Returns zero on success or %-ENOENT on failure. */ int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *n) { return notifier_chain_unregister(&nh->head, n); } EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v) { return notifier_call_chain_robust(&nh->head, val_up, val_down, v); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust); /** * raw_notifier_call_chain - Call functions in a raw notifier chain * @nh: Pointer to head of the raw notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in an undefined context. * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v) { return notifier_call_chain(&nh->head, val, v, -1, NULL); } EXPORT_SYMBOL_GPL(raw_notifier_call_chain); /* * SRCU notifier chain routines. Registration and unregistration * use a mutex, and call_chain is synchronized by SRCU (no locks). */ /** * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: New entry in notifier chain * * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * * Returns 0 on success, %-EEXIST on error. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_register(&nh->head, n, false); mutex_lock(&nh->mutex); ret = notifier_chain_register(&nh->head, n, false); mutex_unlock(&nh->mutex); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); /** * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @n: Entry to remove from notifier chain * * Removes a notifier from an SRCU notifier chain. * Must be called from process context. * * Returns zero on success or %-ENOENT on failure. */ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *n) { int ret; /* * This code gets used during boot-up, when task switching is * not yet working and interrupts must remain disabled. At * such times we must not call mutex_lock(). */ if (unlikely(system_state == SYSTEM_BOOTING)) return notifier_chain_unregister(&nh->head, n); mutex_lock(&nh->mutex); ret = notifier_chain_unregister(&nh->head, n); mutex_unlock(&nh->mutex); synchronize_srcu(&nh->srcu); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); /** * srcu_notifier_call_chain - Call functions in an SRCU notifier chain * @nh: Pointer to head of the SRCU notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * * Calls each function in a notifier chain in turn. The functions * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value * of the last notifier function called. */ int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v) { int ret; int idx; idx = srcu_read_lock(&nh->srcu); ret = notifier_call_chain(&nh->head, val, v, -1, NULL); srcu_read_unlock(&nh->srcu, idx); return ret; } EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); /** * srcu_init_notifier_head - Initialize an SRCU notifier head * @nh: Pointer to head of the srcu notifier chain * * Unlike other sorts of notifier heads, SRCU notifier heads require * dynamic initialization. Be sure to call this routine before * calling any of the other SRCU notifier routines for this head. * * If an SRCU notifier head is deallocated, it must first be cleaned * up by calling srcu_cleanup_notifier_head(). Otherwise the head's * per-cpu data (used by the SRCU mechanism) will leak. */ void srcu_init_notifier_head(struct srcu_notifier_head *nh) { mutex_init(&nh->mutex); if (init_srcu_struct(&nh->srcu) < 0) BUG(); nh->head = NULL; } EXPORT_SYMBOL_GPL(srcu_init_notifier_head); static ATOMIC_NOTIFIER_HEAD(die_chain); int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { .regs = regs, .str = str, .err = err, .trapnr = trap, .signr = sig, }; RCU_LOCKDEP_WARN(!rcu_is_watching(), "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier);
296 296 294 297 297 297 297 296 293 297 221 224 223 223 222 297 294 295 296 297 297 12 296 295 4 296 19 1 1 2 1 1 22 1 21 1 19 2 19 19 4 1 1 1 19 19 19 2 1 19 2 19 19 19 22 21 1 20 21 20 21 2 21 19 19 62 62 62 62 62 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: policy rules. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Thomas Graf <tgraf@suug.ch> * * Fixes: * Rani Assaf : local_rule cannot be deleted * Marc Boucher : routing by fwmark */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/inetdevice.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/fib_rules.h> #include <linux/indirect_call_wrapper.h> struct fib4_rule { struct fib_rule common; u8 dst_len; u8 src_len; dscp_t dscp; u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; __be32 dst; __be32 dstmask; #ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; static bool fib4_rule_matchall(const struct fib_rule *rule) { struct fib4_rule *r = container_of(rule, struct fib4_rule, common); if (r->dst_len || r->src_len || r->dscp) return false; return fib_rule_matchall(rule); } bool fib4_rule_default(const struct fib_rule *rule) { if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN && rule->table != RT_TABLE_DEFAULT) return false; return true; } EXPORT_SYMBOL_GPL(fib4_rule_default); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET); } int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, .flags = flags, }; int err; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi4_to_flowi(flp)); err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; else res->tclassid = 0; #endif if (err == -ESRCH) err = -ENETUNREACH; return err; } EXPORT_SYMBOL_GPL(__fib_lookup); INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { int err = -EAGAIN; struct fib_table *tbl; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } rcu_read_lock(); tb_id = fib_rule_get_table(rule, arg); tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, arg->flags); rcu_read_unlock(); return err; } INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg) { struct fib_result *result = arg->result; struct net_device *dev = NULL; if (result->fi) { struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); dev = nhc->nhc_dev; } /* do not accept result if the route does * not meet the required prefix length */ if (result->prefixlen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: if (!(arg->flags & FIB_LOOKUP_NOREF)) fib_info_put(result->fi); return true; } INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; struct flowi4 *fl4 = &fl->u.ip4; __be32 daddr = fl4->daddr; __be32 saddr = fl4->saddr; if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; /* When DSCP selector is used we need to match on the entire DSCP field * in the flow information structure. When TOS selector is used we need * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) return 0; if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) return 0; if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) return 0; return 1; } static struct fib_table *fib_empty_table(struct net *net) { u32 id = 1; while (1) { if (!fib_get_table(net, id)) return fib_new_table(net, id); if (id++ == RT_TABLE_MAX) break; } return NULL; } static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, struct netlink_ext_ack *extack) { if (rule4->dscp) { NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); return -EINVAL; } rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); rule4->dscp_full = true; return 0; } static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); goto errout; } /* IPv4 currently doesn't handle high order DSCP bits correctly */ if (frh->tos & ~IPTOS_TOS_MASK) { NL_SET_ERR_MSG(extack, "Invalid tos"); goto errout; } rule4->dscp = inet_dsfield_to_dscp(frh->tos); if (tb[FRA_DSCP] && fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) goto errout; /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) goto errout; if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; table = fib_empty_table(net); if (!table) { err = -ENOBUFS; goto errout; } rule->table = table->tb_id; } } if (frh->src_len) rule4->src = nla_get_in_addr(tb[FRA_SRC]); if (frh->dst_len) rule4->dst = nla_get_in_addr(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); } #endif if (fib_rule_requires_fldissect(rule)) net->ipv4.fib_rules_require_fldissect++; rule4->src_len = frh->src_len; rule4->srcmask = inet_make_mask(rule4->src_len); rule4->dst_len = frh->dst_len; rule4->dstmask = inet_make_mask(rule4->dst_len); net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; } static int fib4_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; int err; /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) goto errout; #ifdef CONFIG_IP_ROUTE_CLASSID if (((struct fib4_rule *)rule)->tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif net->ipv4.fib_has_custom_rules = true; if (net->ipv4.fib_rules_require_fldissect && fib_rule_requires_fldissect(rule)) net->ipv4.fib_rules_require_fldissect--; errout: return err; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (frh->src_len && (rule4->src_len != frh->src_len)) return 0; if (frh->dst_len && (rule4->dst_len != frh->dst_len)) return 0; if (frh->tos && (rule4->dscp_full || inet_dscp_to_dsfield(rule4->dscp) != frh->tos)) return 0; if (tb[FRA_DSCP]) { dscp_t dscp; dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); if (!rule4->dscp_full || rule4->dscp != dscp) return 0; } #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC]))) return 0; if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST]))) return 0; return 1; } static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; frh->dst_len = rule4->dst_len; frh->src_len = rule4->src_len; if (rule4->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, inet_dscp_to_dsfield(rule4->dscp) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule4->dscp); } if ((rule4->dst_len && nla_put_in_addr(skb, FRA_DST, rule4->dst)) || (rule4->src_len && nla_put_in_addr(skb, FRA_SRC, rule4->src))) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid && nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) goto nla_put_failure; #endif return 0; nla_put_failure: return -ENOBUFS; } static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ + nla_total_size(4) /* flow */ + nla_total_size(1); /* dscp */ } static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { rt_cache_flush(ops->fro_net); } static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .owner = THIS_MODULE, }; static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT); if (err < 0) return err; return 0; } int __net_init fib4_rules_init(struct net *net) { int err; struct fib_rules_ops *ops; ops = fib_rules_register(&fib4_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); err = fib_default_rules_init(ops); if (err < 0) goto fail; net->ipv4.rules_ops = ops; net->ipv4.fib_has_custom_rules = false; net->ipv4.fib_rules_require_fldissect = 0; return 0; fail: /* also cleans all rules already added */ fib_rules_unregister(ops); return err; } void __net_exit fib4_rules_exit(struct net *net) { fib_rules_unregister(net->ipv4.rules_ops); }
28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty * path correctly just ensure @path is initialized: * * struct path path __free(path_put) = {}; */ #define __free_path_put path_put #endif /* _LINUX_PATH_H */
19 19 19 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/once.h> #include <linux/random.h> #include <linux/module.h> struct once_work { struct work_struct work; struct static_key_true *key; struct module *module; }; static void once_deferred(struct work_struct *w) { struct once_work *work; work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); static_branch_disable(work->key); module_put(work->module); kfree(work); } static void once_disable_jump(struct static_key_true *key, struct module *mod) { struct once_work *w; w = kmalloc(sizeof(*w), GFP_ATOMIC); if (!w) return; INIT_WORK(&w->work, once_deferred); w->key = key; w->module = mod; __module_get(mod); schedule_work(&w->work); } static DEFINE_SPINLOCK(once_lock); bool __do_once_start(bool *done, unsigned long *flags) __acquires(once_lock) { spin_lock_irqsave(&once_lock, *flags); if (*done) { spin_unlock_irqrestore(&once_lock, *flags); /* Keep sparse happy by restoring an even lock count on * this lock. In case we return here, we don't call into * __do_once_done but return early in the DO_ONCE() macro. */ __acquire(once_lock); return false; } return true; } EXPORT_SYMBOL(__do_once_start); void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod) __releases(once_lock) { *done = true; spin_unlock_irqrestore(&once_lock, *flags); once_disable_jump(once_key, mod); } EXPORT_SYMBOL(__do_once_done); static DEFINE_MUTEX(once_mutex); bool __do_once_sleepable_start(bool *done) __acquires(once_mutex) { mutex_lock(&once_mutex); if (*done) { mutex_unlock(&once_mutex); /* Keep sparse happy by restoring an even lock count on * this mutex. In case we return here, we don't call into * __do_once_done but return early in the DO_ONCE_SLEEPABLE() macro. */ __acquire(once_mutex); return false; } return true; } EXPORT_SYMBOL(__do_once_sleepable_start); void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, struct module *mod) __releases(once_mutex) { *done = true; mutex_unlock(&once_mutex); once_disable_jump(once_key, mod); } EXPORT_SYMBOL(__do_once_sleepable_done);
62 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0-or-later /****************************************************************************** * vlanproc.c VLAN Module. /proc filesystem interface. * * This module is completely hardware-independent and provides * access to the router using Linux /proc filesystem. * * Author: Ben Greear, <greearb@candelatech.com> coppied from wanproc.c * by: Gene Kozin <genek@compuserve.com> * * Copyright: (c) 1998 Ben Greear * * ============================================================================ * Jan 20, 1998 Ben Greear Initial Version *****************************************************************************/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "vlanproc.h" #include "vlan.h" /****** Function Prototypes *************************************************/ /* Methods for preparing data for reading proc entries */ static int vlan_seq_show(struct seq_file *seq, void *v); static void *vlan_seq_start(struct seq_file *seq, loff_t *pos); static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos); static void vlan_seq_stop(struct seq_file *seq, void *); static int vlandev_seq_show(struct seq_file *seq, void *v); /* * Global Data */ /* * Names of the proc directory entries */ static const char name_root[] = "vlan"; static const char name_conf[] = "config"; /* * Structures for interfacing with the /proc filesystem. * VLAN creates its own directory /proc/net/vlan with the following * entries: * config device status/configuration * <device> entry for each device */ /* * Generic /proc/net/vlan/<file> file and inode operations */ static const struct seq_operations vlan_seq_ops = { .start = vlan_seq_start, .next = vlan_seq_next, .stop = vlan_seq_stop, .show = vlan_seq_show, }; /* * Proc filesystem directory entries. */ /* Strings */ static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", }; /* * Interface functions */ /* * Clean up /proc/net/vlan entries */ void vlan_proc_cleanup(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); if (vn->proc_vlan_conf) remove_proc_entry(name_conf, vn->proc_vlan_dir); if (vn->proc_vlan_dir) remove_proc_entry(name_root, net->proc_net); /* Dynamically added entries should be cleaned up as their vlan_device * is removed, so we should not have to take care of it here... */ } /* * Create /proc/net/vlan entries */ int __net_init vlan_proc_init(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net); if (!vn->proc_vlan_dir) goto err; vn->proc_vlan_conf = proc_create_net(name_conf, S_IFREG | 0600, vn->proc_vlan_dir, &vlan_seq_ops, sizeof(struct seq_net_private)); if (!vn->proc_vlan_conf) goto err; return 0; err: pr_err("can't create entry in proc filesystem!\n"); vlan_proc_cleanup(net); return -ENOBUFS; } /* * Add directory entry for VLAN device. */ int vlan_proc_add_dev(struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); if (!strcmp(vlandev->name, name_conf)) return -EINVAL; vlan->dent = proc_create_single_data(vlandev->name, S_IFREG | 0600, vn->proc_vlan_dir, vlandev_seq_show, vlandev); if (!vlan->dent) return -ENOBUFS; return 0; } /* * Delete directory entry for VLAN device. */ void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. */ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; } /****** Proc filesystem entry points ****************************************/ /* * The following few functions build the content of /proc/net/vlan/config */ static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos) { unsigned long ifindex = *pos; struct net_device *dev; for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { if (!is_vlan_dev(dev)) continue; *pos = dev->ifindex; return dev; } return NULL; } static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; return vlan_seq_from_index(seq, pos); } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return vlan_seq_from_index(seq, pos); } static void vlan_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } static int vlan_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); struct vlan_net *vn = net_generic(net, vlan_net_id); if (v == SEQ_START_TOKEN) { const char *nmtype = NULL; seq_puts(seq, "VLAN Dev name | VLAN ID\n"); if (vn->name_type < ARRAY_SIZE(vlan_name_type_str)) nmtype = vlan_name_type_str[vn->name_type]; seq_printf(seq, "Name-Type: %s\n", nmtype ? nmtype : "UNKNOWN"); } else { const struct net_device *vlandev = v; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, vlan->vlan_id, vlan->real_dev->name); } return 0; } static int vlandev_seq_show(struct seq_file *seq, void *offset) { struct net_device *vlandev = (struct net_device *) seq->private; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats; static const char fmt64[] = "%30s %12llu\n"; int i; if (!is_vlan_dev(vlandev)) return 0; stats = dev_get_stats(vlandev, &temp); seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n", vlandev->name, vlan->vlan_id, (int)(vlan->flags & 1), (u32)vlandev->priv_flags); seq_printf(seq, fmt64, "total frames received", stats->rx_packets); seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast); seq_puts(seq, "\n"); seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets); seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes); seq_printf(seq, "Device: %s", vlan->real_dev->name); /* now show all PRIORITY mappings relating to this VLAN */ seq_printf(seq, "\nINGRESS priority mappings: " "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", vlan->ingress_priority_map[0], vlan->ingress_priority_map[1], vlan->ingress_priority_map[2], vlan->ingress_priority_map[3], vlan->ingress_priority_map[4], vlan->ingress_priority_map[5], vlan->ingress_priority_map[6], vlan->ingress_priority_map[7]); seq_printf(seq, " EGRESS priority mappings: "); for (i = 0; i < 16; i++) { const struct vlan_priority_tci_mapping *mp = vlan->egress_priority_map[i]; while (mp) { seq_printf(seq, "%u:%d ", mp->priority, ((mp->vlan_qos >> 13) & 0x7)); mp = mp->next; } } seq_puts(seq, "\n"); return 0; }
1 1 2 2 2 2 1 1 1 1 1 2 2 2 11 11 11 1 1 28 28 28 1 1 1 1 1 7 7 7 7 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018, 2021-2024 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "core.h" #include "trace.h" static inline int rdev_suspend(struct cfg80211_registered_device *rdev, struct cfg80211_wowlan *wowlan) { int ret; trace_rdev_suspend(&rdev->wiphy, wowlan); ret = rdev->ops->suspend(&rdev->wiphy, wowlan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_resume(struct cfg80211_registered_device *rdev) { int ret; trace_rdev_resume(&rdev->wiphy); ret = rdev->ops->resume(&rdev->wiphy); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, bool enabled) { trace_rdev_set_wakeup(&rdev->wiphy, enabled); rdev->ops->set_wakeup(&rdev->wiphy, enabled); trace_rdev_return_void(&rdev->wiphy); } static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, type, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_del_virtual_intf(&rdev->wiphy, wdev); ret = rdev->ops->del_virtual_intf(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { int ret; trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type); ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast) { int ret; trace_rdev_set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_settings *settings) { int ret; trace_rdev_start_ap(&rdev->wiphy, dev, settings); ret = rdev->ops->start_ap(&rdev->wiphy, dev, settings); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_update *info) { int ret; trace_rdev_change_beacon(&rdev->wiphy, dev, info); ret = rdev->ops->change_beacon(&rdev->wiphy, dev, info); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { int ret; trace_rdev_stop_ap(&rdev->wiphy, dev, link_id); ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_add_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct station_del_parameters *params) { int ret; trace_rdev_del_station(&rdev->wiphy, dev, params); ret = rdev->ops->del_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_change_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_get_station(&rdev->wiphy, dev, mac); ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_dump_station(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac); ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_add_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst) { int ret; trace_rdev_del_mpath(&rdev->wiphy, dev, dst); ret = rdev->ops->del_mpath(&rdev->wiphy, dev, dst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_change_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp); ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop); ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp); ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_config *conf) { int ret; trace_rdev_get_mesh_config(&rdev->wiphy, dev); ret = rdev->ops->get_mesh_config(&rdev->wiphy, dev, conf); trace_rdev_return_int_mesh_config(&rdev->wiphy, ret, conf); return ret; } static inline int rdev_update_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { int ret; trace_rdev_update_mesh_config(&rdev->wiphy, dev, mask, nconf); ret = rdev->ops->update_mesh_config(&rdev->wiphy, dev, mask, nconf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { int ret; trace_rdev_join_mesh(&rdev->wiphy, dev, conf, setup); ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_mesh(&rdev->wiphy, dev); ret = rdev->ops->leave_mesh(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup) { int ret; trace_rdev_join_ocb(&rdev->wiphy, dev, setup); ret = rdev->ops->join_ocb(&rdev->wiphy, dev, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ocb(&rdev->wiphy, dev); ret = rdev->ops->leave_ocb(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_bss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct bss_parameters *params) { int ret; trace_rdev_change_bss(&rdev->wiphy, dev, params); ret = rdev->ops->change_bss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_inform_bss(struct cfg80211_registered_device *rdev, struct cfg80211_bss *bss, const struct cfg80211_bss_ies *ies, void *drv_data) { trace_rdev_inform_bss(&rdev->wiphy, bss); if (rdev->ops->inform_bss) rdev->ops->inform_bss(&rdev->wiphy, bss, ies, drv_data); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_txq_params *params) { int ret; trace_rdev_set_txq_params(&rdev->wiphy, dev, params); ret = rdev->ops->set_txq_params(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan) { int ret; trace_rdev_libertas_set_mesh_channel(&rdev->wiphy, dev, chan); ret = rdev->ops->libertas_set_mesh_channel(&rdev->wiphy, dev, chan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_monitor_channel(&rdev->wiphy, dev, chandef); ret = rdev->ops->set_monitor_channel(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request *request) { int ret; if (WARN_ON_ONCE(!request->n_ssids && request->ssids)) return -EINVAL; trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_abort_scan(&rdev->wiphy, wdev); rdev->ops->abort_scan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { int ret; trace_rdev_auth(&rdev->wiphy, dev, req); ret = rdev->ops->auth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { int ret; trace_rdev_assoc(&rdev->wiphy, dev, req); ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_deauth_request *req) { int ret; trace_rdev_deauth(&rdev->wiphy, dev, req); ret = rdev->ops->deauth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_disassoc_request *req) { int ret; trace_rdev_disassoc(&rdev->wiphy, dev, req); ret = rdev->ops->disassoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme) { int ret; trace_rdev_connect(&rdev->wiphy, dev, sme); ret = rdev->ops->connect(&rdev->wiphy, dev, sme); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_connect_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme, u32 changed) { int ret; trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed); ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason_code) { int ret; trace_rdev_disconnect(&rdev->wiphy, dev, reason_code); ret = rdev->ops->disconnect(&rdev->wiphy, dev, reason_code); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params) { int ret; trace_rdev_join_ibss(&rdev->wiphy, dev, params); ret = rdev->ops->join_ibss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ibss(&rdev->wiphy, dev); ret = rdev->ops->leave_ibss(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) { int ret = -EOPNOTSUPP; trace_rdev_set_wiphy_params(&rdev->wiphy, changed); if (rdev->ops->set_wiphy_params) ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { int ret; trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm); ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int *dbm) { int ret; trace_rdev_get_tx_power(&rdev->wiphy, wdev); ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, const bool enabled) { int ret; trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled); ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_txq_stats(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { int ret; trace_rdev_get_txq_stats(&rdev->wiphy, wdev); ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { trace_rdev_rfkill_poll(&rdev->wiphy); rdev->ops->rfkill_poll(&rdev->wiphy); trace_rdev_return_void(&rdev->wiphy); } #ifdef CONFIG_NL80211_TESTMODE static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, void *data, int len) { int ret; trace_rdev_testmode_cmd(&rdev->wiphy, wdev); ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { int ret; trace_rdev_testmode_dump(&rdev->wiphy); ret = rdev->ops->testmode_dump(&rdev->wiphy, skb, cb, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } #endif static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev, struct net_device *netdev, int idx, struct survey_info *info) { int ret; trace_rdev_dump_survey(&rdev->wiphy, netdev, idx); ret = rdev->ops->dump_survey(&rdev->wiphy, netdev, idx, info); if (ret < 0) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_survey_info(&rdev->wiphy, ret, info); return ret; } static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_set_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->set_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_del_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->del_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev) { int ret; trace_rdev_flush_pmksa(&rdev->wiphy, netdev); ret = rdev->ops->flush_pmksa(&rdev->wiphy, netdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { int ret; trace_rdev_remain_on_channel(&rdev->wiphy, wdev, chan, duration); ret = rdev->ops->remain_on_channel(&rdev->wiphy, wdev, chan, duration, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); ret = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { int ret; trace_rdev_mgmt_tx(&rdev->wiphy, wdev, params); ret = rdev->ops->mgmt_tx(&rdev->wiphy, wdev, params, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, const bool noencrypt, int link, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link, cookie); if (cookie) trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); else trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); ret = rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, bool enabled, int timeout) { int ret; trace_rdev_set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); ret = rdev->ops->set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { int ret; trace_rdev_set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); ret = rdev->ops->set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 low, s32 high) { int ret; trace_rdev_set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); ret = rdev->ops->set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 rate, u32 pkts, u32 intvl) { int ret; trace_rdev_set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); ret = rdev->ops->set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { might_sleep(); trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); if (rdev->ops->update_mgmt_frame_registrations) rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, u32 tx_ant, u32 rx_ant) { int ret; trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant); ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, u32 *tx_ant, u32 *rx_ant) { int ret; trace_rdev_get_antenna(&rdev->wiphy); ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_tx_rx(&rdev->wiphy, ret, *tx_ant, *rx_ant); return ret; } static inline int rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_sched_scan_request *request) { int ret; trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev, struct net_device *dev, u64 reqid) { int ret; trace_rdev_sched_scan_stop(&rdev->wiphy, dev, reqid); ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev, reqid); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { int ret; trace_rdev_set_rekey_data(&rdev->wiphy, dev); ret = rdev->ops->set_rekey_data(&rdev->wiphy, dev, data); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, enum nl80211_tdls_operation oper) { int ret; trace_rdev_tdls_oper(&rdev->wiphy, dev, peer, oper); ret = rdev->ops->tdls_oper(&rdev->wiphy, dev, peer, oper); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_client(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u64 *cookie) { int ret; trace_rdev_probe_client(&rdev->wiphy, dev, peer); ret = rdev->ops->probe_client(&rdev->wiphy, dev, peer, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 noack_map) { int ret; trace_rdev_set_noack_map(&rdev->wiphy, dev, noack_map); ret = rdev->ops->set_noack_map(&rdev->wiphy, dev, noack_map); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_get_channel(&rdev->wiphy, wdev, link_id); ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; } static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_start_p2p_device(&rdev->wiphy, wdev); ret = rdev->ops->start_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_p2p_device(&rdev->wiphy, wdev); rdev->ops->stop_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { int ret; trace_rdev_start_nan(&rdev->wiphy, wdev, conf); ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_nan(&rdev->wiphy, wdev); rdev->ops->stop_nan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { int ret; trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_nan_change_conf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { int ret; trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); if (rdev->ops->nan_change_conf) ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, changes); else ret = -EOPNOTSUPP; trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) { int ret; trace_rdev_set_mac_acl(&rdev->wiphy, dev, params); ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie) { int ret; trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie); ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_crit_proto_id protocol, u16 duration) { int ret; trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration); ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev, protocol, duration); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_crit_proto_stop(&rdev->wiphy, wdev); rdev->ops->crit_proto_stop(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_csa_settings *params) { int ret; trace_rdev_channel_switch(&rdev->wiphy, dev, params); ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_qos_map *qos_map) { int ret = -EOPNOTSUPP; if (rdev->ops->set_qos_map) { trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map); ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map); trace_rdev_return_int(&rdev->wiphy, ret); } return ret; } static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time) { int ret = -EOPNOTSUPP; trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); if (rdev->ops->add_tx_ts) ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer) { int ret = -EOPNOTSUPP; trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); if (rdev->ops->del_tx_ts) ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr) { trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_radar_detection(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms, int link_id) { int ret = -EOPNOTSUPP; trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); if (rdev->ops->start_radar_detection) ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_end_cac(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { trace_rdev_end_cac(&rdev->wiphy, dev, link_id); if (rdev->ops->end_cac) rdev->ops->end_cac(&rdev->wiphy, dev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, int mcast_rate[NUM_NL80211_BANDS]) { int ret = -EOPNOTSUPP; trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); if (rdev->ops->set_mcast_rate) ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_coalesce(struct cfg80211_registered_device *rdev, struct cfg80211_coalesce *coalesce) { int ret = -EOPNOTSUPP; trace_rdev_set_coalesce(&rdev->wiphy, coalesce); if (rdev->ops->set_coalesce) ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_pmk_conf *pmk_conf) { int ret = -EOPNOTSUPP; trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf); if (rdev->ops->set_pmk) ret = rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *aa) { int ret = -EOPNOTSUPP; trace_rdev_del_pmk(&rdev->wiphy, dev, aa); if (rdev->ops->del_pmk) ret = rdev->ops->del_pmk(&rdev->wiphy, dev, aa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_external_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_external_auth_params *params) { int ret = -EOPNOTSUPP; trace_rdev_external_auth(&rdev->wiphy, dev, params); if (rdev->ops->external_auth) ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { int ret = -EOPNOTSUPP; trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); if (rdev->ops->get_ftm_responder_stats) ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->start_pmsr) ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->abort_pmsr) rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_owe_info *oweinfo) { int ret = -EOPNOTSUPP; trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); if (rdev->ops->update_owe_info) ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *dest, const void *buf, size_t len) { int ret; trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { int ret; trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf); ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u8 tids) { int ret; trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids); ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, struct cfg80211_sar_specs *sar) { int ret; trace_rdev_set_sar_specs(&rdev->wiphy, sar); ret = rdev->ops->set_sar_specs(&rdev->wiphy, sar); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_color_change(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_color_change_settings *params) { int ret; trace_rdev_color_change(&rdev->wiphy, dev, params); ret = rdev->ops->color_change(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_fils_aad(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_fils_aad *fils_aad) { int ret = -EOPNOTSUPP; trace_rdev_set_fils_aad(&rdev->wiphy, dev, fils_aad); if (rdev->ops->set_fils_aad) ret = rdev->ops->set_fils_aad(&rdev->wiphy, dev, fils_aad); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_radar_background(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_radar_background(wiphy, chandef); if (rdev->ops->set_radar_background) ret = rdev->ops->set_radar_background(wiphy, chandef); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_add_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { int ret = 0; trace_rdev_add_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->add_intf_link) ret = rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { trace_rdev_del_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->del_intf_link) rdev->ops->del_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_add_link_station(&rdev->wiphy, dev, params); if (rdev->ops->add_link_station) ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mod_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_mod_link_station(&rdev->wiphy, dev, params); if (rdev->ops->mod_link_station) ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_del_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_del_link_station(&rdev->wiphy, dev, params); if (rdev->ops->del_link_station) ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_hw_timestamp(wiphy, dev, hwts); if (rdev->ops->set_hw_timestamp) ret = rdev->ops->set_hw_timestamp(wiphy, dev, hwts); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_set_ttlm(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_ttlm(wiphy, dev, params); if (rdev->ops->set_ttlm) ret = rdev->ops->set_ttlm(wiphy, dev, params); trace_rdev_return_int(wiphy, ret); return ret; } static inline u32 rdev_get_radio_mask(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wiphy *wiphy = &rdev->wiphy; if (!rdev->ops->get_radio_mask) return 0; return rdev->ops->get_radio_mask(wiphy, dev); } #endif /* __CFG80211_RDEV_OPS */
5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/async.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/dirent.h> #include <linux/syscalls.h> #include <linux/utime.h> #include <linux/file.h> #include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/namei.h> #include <linux/init_syscalls.h> #include <linux/umh.h> #include <linux/security.h> #include "do_mounts.h" static __initdata bool csum_present; static __initdata u32 io_csum; static ssize_t __init xwrite(struct file *file, const unsigned char *p, size_t count, loff_t *pos) { ssize_t out = 0; /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ while (count) { ssize_t rv = kernel_write(file, p, count, pos); if (rv < 0) { if (rv == -EINTR || rv == -EAGAIN) continue; return out ? out : rv; } else if (rv == 0) break; if (csum_present) { ssize_t i; for (i = 0; i < rv; i++) io_csum += p[i]; } p += rv; out += rv; count -= rv; } return out; } static __initdata char *message; static void __init error(char *x) { if (!message) message = x; } #define panic_show_mem(fmt, ...) \ ({ show_mem(); panic(fmt, ##__VA_ARGS__); }) /* link hash */ #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) static __initdata struct hash { int ino, minor, major; umode_t mode; struct hash *next; char name[N_ALIGN(PATH_MAX)]; } *head[32]; static inline int hash(int major, int minor, int ino) { unsigned long tmp = ino + minor + (major << 3); tmp += tmp >> 5; return tmp & 31; } static char __init *find_link(int major, int minor, int ino, umode_t mode, char *name) { struct hash **p, *q; for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { if ((*p)->ino != ino) continue; if ((*p)->minor != minor) continue; if ((*p)->major != major) continue; if (((*p)->mode ^ mode) & S_IFMT) continue; return (*p)->name; } q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) panic_show_mem("can't allocate link hash entry"); q->major = major; q->minor = minor; q->ino = ino; q->mode = mode; strcpy(q->name, name); q->next = NULL; *p = q; return NULL; } static void __init free_hash(void) { struct hash **p, *q; for (p = head; p < head + 32; p++) { while (*p) { q = *p; *p = q->next; kfree(q); } } } #ifdef CONFIG_INITRAMFS_PRESERVE_MTIME static void __init do_utime(char *filename, time64_t mtime) { struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; init_utimes(filename, t); } static void __init do_utime_path(const struct path *path, time64_t mtime) { struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; time64_t mtime; char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { size_t nlen = strlen(name) + 1; struct dir_entry *de; de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } static void __init dir_utime(void) { struct dir_entry *de, *tmp; list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); kfree(de); } } #else static void __init do_utime(char *filename, time64_t mtime) {} static void __init do_utime_path(const struct path *path, time64_t mtime) {} static void __init dir_add(const char *name, time64_t mtime) {} static void __init dir_utime(void) {} #endif static __initdata time64_t mtime; /* cpio header parsing */ static __initdata unsigned long ino, major, minor, nlink; static __initdata umode_t mode; static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; static __initdata u32 hdr_csum; static void __init parse_header(char *s) { unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } ino = parsed[0]; mode = parsed[1]; uid = parsed[2]; gid = parsed[3]; nlink = parsed[4]; mtime = parsed[5]; /* breaks in y2106 */ body_len = parsed[6]; major = parsed[7]; minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; hdr_csum = parsed[12]; } /* FSM */ static __initdata enum state { Start, Collect, GotHeader, SkipIt, GotName, CopyFile, GotSymlink, Reset } state, next_state; static __initdata char *victim; static unsigned long byte_count __initdata; static __initdata loff_t this_header, next_header; static inline void __init eat(unsigned n) { victim += n; this_header += n; byte_count -= n; } static __initdata char *collected; static long remains __initdata; static __initdata char *collect; static void __init read_into(char *buf, unsigned size, enum state next) { if (byte_count >= size) { collected = victim; eat(size); state = next; } else { collect = collected = buf; remains = size; next_state = next; state = Collect; } } static __initdata char *header_buf, *symlink_buf, *name_buf; static int __init do_start(void) { read_into(header_buf, 110, GotHeader); return 0; } static int __init do_collect(void) { unsigned long n = remains; if (byte_count < n) n = byte_count; memcpy(collect, victim, n); eat(n); collect += n; if ((remains -= n) != 0) return 1; state = next_state; return 0; } static int __init do_header(void) { if (!memcmp(collected, "070701", 6)) { csum_present = false; } else if (!memcmp(collected, "070702", 6)) { csum_present = true; } else { if (memcmp(collected, "070707", 6) == 0) error("incorrect cpio method used: use -H newc option"); else error("no cpio magic"); return 1; } parse_header(collected); next_header = this_header + N_ALIGN(name_len) + body_len; next_header = (next_header + 3) & ~3; state = SkipIt; if (name_len <= 0 || name_len > PATH_MAX) return 0; if (S_ISLNK(mode)) { if (body_len > PATH_MAX) return 0; collect = collected = symlink_buf; remains = N_ALIGN(name_len) + body_len; next_state = GotSymlink; state = Collect; return 0; } if (S_ISREG(mode) || !body_len) read_into(name_buf, N_ALIGN(name_len), GotName); return 0; } static int __init do_skip(void) { if (this_header + byte_count < next_header) { eat(byte_count); return 1; } else { eat(next_header - this_header); state = next_state; return 0; } } static int __init do_reset(void) { while (byte_count && *victim == '\0') eat(1); if (byte_count && (this_header & 3)) error("broken padding"); return 1; } static void __init clean_path(char *path, umode_t fmode) { struct kstat st; if (!init_stat(path, &st, AT_SYMLINK_NOFOLLOW) && (st.mode ^ fmode) & S_IFMT) { if (S_ISDIR(st.mode)) init_rmdir(path); else init_unlink(path); } } static int __init maybe_link(void) { if (nlink >= 2) { char *old = find_link(major, minor, ino, mode, collected); if (old) { clean_path(collected, 0); return (init_link(old, collected) < 0) ? -1 : 1; } } return 0; } static __initdata struct file *wfile; static __initdata loff_t wfile_pos; static int __init do_name(void) { state = SkipIt; next_state = Reset; /* name_len > 0 && name_len <= PATH_MAX checked in do_header */ if (collected[name_len - 1] != '\0') { pr_err("initramfs name without nulterm: %.*s\n", (int)name_len, collected); error("malformed archive"); return 1; } if (strcmp(collected, "TRAILER!!!") == 0) { free_hash(); return 0; } clean_path(collected, mode); if (S_ISREG(mode)) { int ml = maybe_link(); if (ml >= 0) { int openflags = O_WRONLY|O_CREAT|O_LARGEFILE; if (ml != 1) openflags |= O_TRUNC; wfile = filp_open(collected, openflags, mode); if (IS_ERR(wfile)) return 0; wfile_pos = 0; io_csum = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); if (body_len) vfs_truncate(&wfile->f_path, body_len); state = CopyFile; } } else if (S_ISDIR(mode)) { init_mkdir(collected, mode); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); dir_add(collected, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { init_mknod(collected, mode, rdev); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); do_utime(collected, mtime); } } return 0; } static int __init do_copy(void) { if (byte_count >= body_len) { if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); do_utime_path(&wfile->f_path, mtime); fput(wfile); if (csum_present && io_csum != hdr_csum) error("bad data checksum"); eat(body_len); state = SkipIt; return 0; } else { if (xwrite(wfile, victim, byte_count, &wfile_pos) != byte_count) error("write error"); body_len -= byte_count; eat(byte_count); return 1; } } static int __init do_symlink(void) { if (collected[name_len - 1] != '\0') { pr_err("initramfs symlink without nulterm: %.*s\n", (int)name_len, collected); error("malformed archive"); return 1; } collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); init_symlink(collected + N_ALIGN(name_len), collected); init_chown(collected, uid, gid, AT_SYMLINK_NOFOLLOW); do_utime(collected, mtime); state = SkipIt; next_state = Reset; return 0; } static __initdata int (*actions[])(void) = { [Start] = do_start, [Collect] = do_collect, [GotHeader] = do_header, [SkipIt] = do_skip, [GotName] = do_name, [CopyFile] = do_copy, [GotSymlink] = do_symlink, [Reset] = do_reset, }; static long __init write_buffer(char *buf, unsigned long len) { byte_count = len; victim = buf; while (!actions[state]()) ; return len - byte_count; } static long __init flush_buffer(void *bufv, unsigned long len) { char *buf = bufv; long written; long origLen = len; if (message) return -1; while ((written = write_buffer(buf, len)) < len && !message) { char c = buf[written]; if (c == '0') { buf += written; len -= written; state = Start; } else if (c == 0) { buf += written; len -= written; state = Reset; } else error("junk within compressed archive"); } return origLen; } static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */ #include <linux/decompress/generic.h> static char * __init unpack_to_rootfs(char *buf, unsigned long len) { long written; decompress_fn decompress; const char *compress_name; static __initdata char msg_buf[64]; header_buf = kmalloc(110, GFP_KERNEL); symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); if (!header_buf || !symlink_buf || !name_buf) panic_show_mem("can't allocate buffers"); state = Start; this_header = 0; message = NULL; while (!message && len) { loff_t saved_offset = this_header; if (*buf == '0' && !(this_header & 3)) { state = Start; written = write_buffer(buf, len); buf += written; len -= written; continue; } if (!*buf) { buf++; len--; this_header++; continue; } this_header = 0; decompress = decompress_method(buf, len, &compress_name); pr_debug("Detected %s compressed data\n", compress_name); if (decompress) { int res = decompress(buf, len, NULL, flush_buffer, NULL, &my_inptr, error); if (res) error("decompressor failed"); } else if (compress_name) { if (!message) { snprintf(msg_buf, sizeof msg_buf, "compression method %s not configured", compress_name); message = msg_buf; } } else error("invalid magic at start of compressed archive"); if (state != Reset) error("junk at the end of compressed archive"); this_header = saved_offset + my_inptr; buf += my_inptr; len -= my_inptr; } dir_utime(); kfree(name_buf); kfree(symlink_buf); kfree(header_buf); return message; } static int __initdata do_retain_initrd; static int __init retain_initrd_param(char *str) { if (*str) return 0; do_retain_initrd = 1; return 1; } __setup("retain_initrd", retain_initrd_param); #ifdef CONFIG_ARCH_HAS_KEEPINITRD static int __init keepinitrd_setup(char *__unused) { do_retain_initrd = 1; return 1; } __setup("keepinitrd", keepinitrd_setup); #endif static bool __initdata initramfs_async = true; static int __init initramfs_async_setup(char *str) { return kstrtobool(str, &initramfs_async) == 0; } __setup("initramfs_async=", initramfs_async_setup); extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> static BIN_ATTR(initrd, 0440, sysfs_bin_attr_simple_read, NULL, 0); void __init reserve_initrd_mem(void) { phys_addr_t start; unsigned long size; /* Ignore the virtul address computed during device tree parsing */ initrd_start = initrd_end = 0; if (!phys_initrd_size) return; /* * Round the memory region to page boundaries as per free_initrd_mem() * This allows us to detect whether the pages overlapping the initrd * are in use, but more importantly, reserves the entire set of pages * as we don't want these pages allocated for other purposes. */ start = round_down(phys_initrd_start, PAGE_SIZE); size = phys_initrd_size + (phys_initrd_start - start); size = round_up(size, PAGE_SIZE); if (!memblock_is_region_memory(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region", (u64)start, size); goto disable; } if (memblock_is_region_reserved(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n", (u64)start, size); goto disable; } memblock_reserve(start, size); /* Now convert initrd to virtual addresses */ initrd_start = (unsigned long)__va(phys_initrd_start); initrd_end = initrd_start + phys_initrd_size; initrd_below_start_ok = 1; return; disable: pr_cont(" - disabling initrd\n"); initrd_start = 0; initrd_end = 0; } void __weak __init free_initrd_mem(unsigned long start, unsigned long end) { #ifdef CONFIG_ARCH_KEEP_MEMBLOCK unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); memblock_free((void *)aligned_start, aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, "initrd"); } #ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ if (initrd_start >= crashk_end || initrd_end <= crashk_start) return false; /* * Initialize initrd memory region since the kexec boot does not do. */ memset((void *)initrd_start, 0, initrd_end - initrd_start); if (initrd_start < crashk_start) free_initrd_mem(initrd_start, crashk_start); if (initrd_end > crashk_end) free_initrd_mem(crashk_end, initrd_end); return true; } #else static inline bool kexec_free_initrd(void) { return false; } #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM static void __init populate_initrd_image(char *err) { ssize_t written; struct file *file; loff_t pos = 0; printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY|O_CREAT|O_LARGEFILE, 0700); if (IS_ERR(file)) return; written = xwrite(file, (char *)initrd_start, initrd_end - initrd_start, &pos); if (written != initrd_end - initrd_start) pr_err("/initrd.image: incomplete write (%zd != %ld)\n", written, initrd_end - initrd_start); fput(file); } #endif /* CONFIG_BLK_DEV_RAM */ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) goto done; if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); else printk(KERN_INFO "Unpacking initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); if (err) { #ifdef CONFIG_BLK_DEV_RAM populate_initrd_image(err); #else printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); #endif } done: security_initramfs_populated(); /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) { free_initrd_mem(initrd_start, initrd_end); } else if (do_retain_initrd && initrd_start) { bin_attr_initrd.size = initrd_end - initrd_start; bin_attr_initrd.private = (void *)initrd_start; if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd)) pr_err("Failed to create initrd sysfs file"); } initrd_start = 0; initrd_end = 0; init_flush_fput(); } static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); static async_cookie_t initramfs_cookie; void wait_for_initramfs(void) { if (!initramfs_cookie) { /* * Something before rootfs_initcall wants to access * the filesystem/initramfs. Probably a bug. Make a * note, avoid deadlocking the machine, and let the * caller's access fail as it used to. */ pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n"); return; } async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain); } EXPORT_SYMBOL_GPL(wait_for_initramfs); static int __init populate_rootfs(void) { initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL, &initramfs_domain); usermodehelper_enable(); if (!initramfs_async) wait_for_initramfs(); return 0; } rootfs_initcall(populate_rootfs);
141 82 78 81 37 82 82 1 1 92 92 92 91 9 9 9 9 1 9 68 66 68 33 110 110 110 110 92 91 91 91 92 92 92 91 110 110 110 57 5 56 110 1 110 110 7 7 110 8 109 40 40 92 92 92 91 98 76 160 161 14 161 32 92 91 161 161 160 160 91 158 160 161 92 161 161 92 160 40 40 40 40 40 160 7 7 7 7 7 161 161 92 92 14 14 92 92 92 92 92 91 91 92 91 92 7 7 7 92 92 32 33 33 92 113 113 113 113 110 113 31 31 113 40 40 113 3 3 3 3 547 545 80 80 3 80 10 80 2 80 80 79 154 154 153 154 2 2 2 2 2 2 154 153 150 4 3 1 150 160 161 161 154 7 153 154 161 153 160 161 92 110 17 161 161 11 160 160 160 159 161 161 8 8 5 3 3 161 110 92 159 161 161 161 110 161 68 68 161 159 160 160 161 110 161 161 2 161 150 102 18 18 18 87 1 23 21 21 21 21 21 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 111 111 91 111 7 111 112 112 112 90 90 86 86 7 90 3 88 5 5 5 1 5 5 82 55 52 51 82 79 14 78 65 78 77 5 5 78 65 5 4 14 6 64 46 53 48 88 88 87 88 88 88 88 17 14 18 14 14 2 87 86 75 65 46 82 4 6 21 21 21 21 21 21 21 21 21 21 19 19 19 13 13 13 1 1 21 86 3 87 3 1 1 1 83 80 82 62 83 83 167 4 50 166 88 78 126 215 126 173 14 86 32 21 21 21 21 21 88 86 88 88 88 87 87 87 11 87 86 87 82 21 82 21 21 21 20 83 21 83 83 81 82 64 81 88 78 88 41 82 82 82 81 82 41 103 102 103 103 82 82 81 66 37 82 82 12 82 82 82 82 5 3 3 3 5 6 6 6 1 5 5 2 2 2 2 3 6 85 84 22 20 20 20 111 31 111 2 111 110 37 37 37 35 111 100 111 111 1 1 1 1 21 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 137 124 125 8 8 23 23 1 6 18 18 18 23 11 11 11 11 40 40 40 40 39 39 39 39 1 40 40 92 92 92 91 91 92 91 92 92 3 3 92 91 91 3 92 92 92 91 92 91 7 7 7 7 7 91 92 92 91 92 92 92 92 7 84 85 92 92 92 46 44 14 8 14 14 14 44 11 44 92 92 92 92 92 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: Pedro Roque : Retransmit queue handled by TCP. * : Fragmentation on mtu decrease * : Segment collapse on retransmit * : AF independence * * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. * Cacophonix Gaul : draft-minshall-nagle-01 * J Hadi Salim : ECN support * */ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> #include <linux/skbuff_ref.h> #include <trace/events/tcp.h> /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. */ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); tp->tcp_clock_cache = val; tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) tp->highest_sack = skb; tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt) || (tp->rx_opt.wscale_ok && ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { unsigned int metric = dst_metric_advmss(dst); if (metric < mss) { mss = metric; tp->advmss = mss; } } return (__u16)mss; } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } /* Congestion state accounting after a packet has been sent. */ static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_jiffies32; if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, increase pingpong count. */ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) inet_csk_inc_pingpong_cnt(sk); } /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt) { struct tcp_sock *tp = tcp_sk(sk); if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, tp->compressed_ack); tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } if (unlikely(rcv_nxt != tp->rcv_nxt)) return; /* Special ACK sent by DCTCP to reflect ECN */ tcp_dec_quickack_mode(sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. * NOTE: for smooth operation initial space offering should * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *__window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); u32 window_clamp = READ_ONCE(*__window_clamp); /* If no clamp set the clamp to the max possible scaled window */ if (window_clamp == 0) window_clamp = (U16_MAX << TCP_MAX_WSCALE); space = min(window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us * it is likely we could be speaking with such a buggy stack * we will truncate our initial window offering to 32K-1 * unless the remote has sent us a window scaling option, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; if (init_rcv_wnd) *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); } /* Set the clamp no higher than max representable value */ WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } EXPORT_SYMBOL(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */ static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 old_win = tp->rcv_wnd; u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we * are out of memory. The window is temporary, so we don't store * it on the socket. */ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) return 0; cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) { /* Never shrink the offered window */ if (new_win == 0) NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV); new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* Make sure we do not exceed the maximum possible * scaled window. */ if (!tp->rx_opt.rcv_wscale && READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) { tp->pred_flags = 0; if (old_win) NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV); } else if (old_win == 0) { NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV); } return new_win; } /* Packet ECN state for a SYN-ACK */ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); } /* Packet ECN state for a SYN. */ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } tp->ecn_flags = 0; if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } } static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); } static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); if (tp->ecn_flags & TCP_ECN_OK) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1; } } /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } #define OPTION_SACK_ADVERTISE BIT(0) #define OPTION_TS BIT(1) #define OPTION_MD5 BIT(2) #define OPTION_WSCALE BIT(3) #define OPTION_FAST_OPEN_COOKIE BIT(8) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (unlikely(OPTION_SMC & *options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); } } #endif } struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } #ifdef CONFIG_CGROUP_BPF static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, enum tcp_synack_type synack_type) { if (unlikely(!skb)) return BPF_WRITE_HDR_TCP_CURRENT_MSS; if (unlikely(synack_type == TCP_SYNACK_COOKIE)) return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; return 0; } /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { struct bpf_sock_ops_kern sock_ops; int err; if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ /* init sock_ops */ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; if (req) { /* The listen "sk" cannot be passed here because * it is not locked. It would not make too much * sense to do bpf_setsockopt(listen_sk) based * on individual connection request also. * * Thus, "req" is passed here and the cgroup-bpf-progs * of the listen "sk" will be run. * * "req" is also used here for fastopen even the "sk" here is * a fullsock "child" sk. It is to keep the behavior * consistent between fastopen and non-fastopen on * the bpf programming side. */ sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = *remaining; /* tcp_current_mss() does not pass a skb */ if (skb) bpf_skops_init_skb(&sock_ops, skb, 0); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err || sock_ops.remaining_opt_len == *remaining) return; opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; /* round up to 4 bytes */ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; struct bpf_sock_ops_kern sock_ops; int err; if (likely(!max_opt_len)) return; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; if (req) { sock_ops.sk = (struct sock *)req; sock_ops.syn_skb = syn_skb; } else { sock_owned_by_me(sk); sock_ops.is_fullsock = 1; sock_ops.sk = sk; } sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); sock_ops.remaining_opt_len = max_opt_len; first_opt_off = tcp_hdrlen(skb) - max_opt_len; bpf_skops_init_skb(&sock_ops, skb, first_opt_off); err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); if (err) nr_written = 0; else nr_written = max_opt_len - sock_ops.remaining_opt_len; if (nr_written < max_opt_len) memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts, unsigned int *remaining) { } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct sk_buff *syn_skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { } #endif static __be32 *process_tcp_ao_options(struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key, __be32 *ptr) { #ifdef CONFIG_TCP_AO u8 maclen = tcp_ao_maclen(key->ao_key); if (tcprsk) { u8 aolen = maclen + sizeof(struct tcp_ao_hdr); *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) | (tcprsk->ao_keyid << 8) | (tcprsk->ao_rcv_next)); } else { struct tcp_ao_key *rnext_key; struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk)); rnext_key = READ_ONCE(ao_info->rnext_key); if (WARN_ON_ONCE(!rnext_key)) return ptr; *ptr++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (rnext_key->rcvid)); } opts->hash_location = (__u8 *)ptr; ptr += maclen / sizeof(*ptr); if (unlikely(maclen % sizeof(*ptr))) { memset(ptr, TCPOPT_NOP, sizeof(*ptr)); ptr++; } #endif return ptr; } /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of * TCP options, we learned this through the hard way, so be careful here. * Luckily we can at least blame others for their non-compliance but from * inter-operability perspective it seems that we're somewhat stuck with * the ordering which we have been using if we want to keep working with * those broken things (not that it currently hurts anybody as there isn't * particular reason why the ordering would need to be changed). * * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, const struct tcp_request_sock *tcprsk, struct tcp_out_options *opts, struct tcp_key *key) { __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (tcp_key_is_md5(key)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; } else if (tcp_key_is_ao(key)) { ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr); } if (unlikely(opts->mss)) { *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); } if (likely(OPTION_TS & options)) { if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); } *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); } if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); } if (unlikely(opts->num_sack_blocks)) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } tp->rx_opt.dsack = 0; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { struct tcp_fastopen_cookie *foc = opts->fastopen_cookie; u8 *p = (u8 *)ptr; u32 len; /* Fast Open option length */ if (foc->exp) { len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len; *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FASTOPEN_MAGIC); p += TCPOLEN_EXP_FASTOPEN_BASE; } else { len = TCPOLEN_FASTOPEN_BASE + foc->len; *p++ = TCPOPT_FASTOPEN; *p++ = len; } memcpy(p, foc->val, foc->len); if ((len & 3) == 2) { p[foc->len] = TCPOPT_NOP; p[foc->len + 1] = TCPOPT_NOP; } ptr += (len + 3) >> 2; } smc_options_write(ptr, &options); mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, const struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } } #endif } static void mptcp_set_option_cond(const struct request_sock *req, struct tcp_out_options *opts, unsigned int *remaining) { if (rsk_is_mptcp(req)) { unsigned int size; if (mptcp_synack_options(req, &size, &opts->mptcp)) { if (*remaining >= size) { opts->options |= OPTION_MPTCP; *remaining -= size; } } } } /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_fastopen_request *fastopen = tp->fastopen_req; bool timestamps; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { timestamps = false; opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; } else { timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps); if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); } } /* We always get an MSS option. The option bytes which will be seen in * normal data packets should timestamps be used, must be in the MSS * advertised. But we subtract them from tp->mss_cache so that * calculations in tcp_sendmsg are simpler etc. So account for this * fact here if necessary. If we don't do this correctly, as a * receiver we won't recognize data packets as being full sized when we * should, and thus we won't abide by the delayed ACK rules correctly. * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; if (likely(timestamps)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (fastopen && fastopen->cookie.len >= 0) { u32 need = fastopen->cookie.len; need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = &fastopen->cookie; remaining -= need; tp->syn_fastopen = 1; tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0; } } smc_set_option(tp, opts, &remaining); if (sk_is_mptcp(sk)) { unsigned int size; if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { opts->options |= OPTION_MPTCP; remaining -= size; } } bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ static unsigned int tcp_synack_options(const struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, const struct tcp_key *key, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; /* We can't fit any SACK blocks in a packet with MD5 + TS * options. There was discussion about disabling SACK * rather than TS in order to fit in better with old, * buggy kernels, but that was deemed to be unnecessary. */ if (synack_type != TCP_SYNACK_COOKIE) ireq->tstamp_ok &= !ireq->sack_ok; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; remaining -= tcp_ao_len_aligned(key->ao_key); ireq->tstamp_ok &= !ireq->sack_ok; } /* We always send an MSS option. */ opts->mss = mss; remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!ireq->tstamp_ok)) remaining -= TCPOLEN_SACKPERM_ALIGNED; } if (foc != NULL && foc->len >= 0) { u32 need = foc->len; need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE : TCPOLEN_FASTOPEN_BASE; need = (need + 3) & ~3U; /* Align to 32 bits */ if (remaining >= need) { opts->options |= OPTION_FAST_OPEN_COOKIE; opts->fastopen_cookie = foc; remaining -= need; } } mptcp_set_option_cond(req, opts, &remaining); smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the * final wire format yet. */ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_key *key) { struct tcp_sock *tp = tcp_sk(sk); unsigned int size = 0; unsigned int eff_sacks; opts->options = 0; /* Better than switch (key.type) as it has static branches */ if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; size += TCPOLEN_MD5SIG_ALIGNED; } else if (tcp_key_is_ao(key)) { opts->options |= OPTION_AO; size += tcp_ao_len_aligned(key->ao_key); } if (likely(tp->rx_opt.tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset : 0; opts->tsecr = tp->rx_opt.ts_recent; size += TCPOLEN_TSTAMP_ALIGNED; } /* MPTCP options have precedence over SACK for the limited TCP * option space because a MPTCP connection would be forced to * fall back to regular TCP if a required multipath option is * missing. SACK still gets a chance to use whatever space is * left. */ if (sk_is_mptcp(sk)) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; unsigned int opt_size = 0; if (mptcp_established_options(sk, skb, &opt_size, remaining, &opts->mptcp)) { opts->options |= OPTION_MPTCP; size += opt_size; } } eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)) return size; opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); size += TCPOLEN_SACK_BASE_ALIGNED + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); size = MAX_TCP_OPTION_SPACE - remaining; } return size; } /* TCP SMALL QUEUES (TSQ) * * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev) * to reduce RTT and bufferbloat. * We do this using a special skb destructor (tcp_wfree). * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. * We use one tasklet per cpu, with its own queue of sockets. */ struct tsq_tasklet { struct tasklet_struct tasklet; struct list_head head; /* queue of tcp sockets */ }; static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { bh_lock_sock(sk); if (!sock_owned_by_user(sk)) tcp_tsq_write(sk); else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); bh_unlock_sock(sk); } /* * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ static void tcp_tasklet_func(struct tasklet_struct *t) { struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; struct tcp_sock *tp; struct sock *sk; local_irq_save(flags); list_splice_init(&tsq->head, &list); local_irq_restore(flags); list_for_each_safe(q, n, &list) { tp = list_entry(q, struct tcp_sock, tsq_node); list_del(&tp->tsq_node); sk = (struct sock *)tp; smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); tcp_tsq_handler(sk); sk_free(sk); } } #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ TCPF_WRITE_TIMER_DEFERRED | \ TCPF_DELACK_TIMER_DEFERRED | \ TCPF_MTU_REDUCED_DEFERRED | \ TCPF_ACK_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket * * called from release_sock() to perform protocol dependent * actions before socket release. */ void tcp_release_cb(struct sock *sk) { unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags); unsigned long nflags; /* perform an atomic operation only if at least one flag is set */ do { if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags)); if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); __sock_put(sk); } if (flags & TCPF_WRITE_TIMER_DEFERRED) { tcp_write_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_DELACK_TIMER_DEFERRED) { tcp_delack_timer_handler(sk); __sock_put(sk); } if (flags & TCPF_MTU_REDUCED_DEFERRED) { inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); __sock_put(sk); } if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } EXPORT_SYMBOL(tcp_release_cb); void __init tcp_tasklet_init(void) { int i; for_each_possible_cpu(i) { struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); INIT_LIST_HEAD(&tsq->head); tasklet_setup(&tsq->tasklet, tcp_tasklet_func); } } /* * Write buffer destructor automatically called from kfree_skb. * We can't xmit new skbs from this context, as we might already * hold qdisc lock. */ void tcp_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; struct tsq_tasklet *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. * Will be released by sk_free() from here or tcp_tasklet_func() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); /* If this softirq is serviced by ksoftirqd, we are likely under stress. * Wait until our queues (qdisc + devices) are drained. * This gives : * - less callbacks to tcp_write_xmit(), reducing stress (batches) * - chance for incoming ACK (processed by another cpu maybe) * to migrate this flow (skb->ooo_okay will be eventually set) */ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; oval = smp_load_acquire(&sk->sk_tsq_flags); do { if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); /* queue this socket to tasklet queue */ local_irq_save(flags); tsq = this_cpu_ptr(&tsq_tasklet); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) tasklet_schedule(&tsq->tasklet); local_irq_restore(flags); return; out: sk_free(sk); } /* Note: Called under soft irq. * We can call TCP stack right away, unless socket is owned by user. */ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; tcp_tsq_handler(sk); sock_put(sk); return HRTIMER_NORESTART; } static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = READ_ONCE(sk->sk_pacing_rate); /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); u64 credit = tp->tcp_wstamp_ns - prior_wstamp; /* take into account OS jitter */ len_ns -= min_t(u64, len_ns / 2, credit); tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_key key; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; /* retransmit skbs might have a non zero value in skb->dev * because skb->dev is aliased with skb->rbnode.rb_left */ skb->dev = NULL; } inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_get_current_key(sk, &key); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &key); } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &key); /* Force a PSH flag on all (GSO) packets to expedite GRO flush * at receiver : This slightly improve GRO performance. * Note that we do not force the PSH flag for non GSO packets, * because they might be sent under high congestion events, * and in this case it is better to delay the delivery of 1-MSS * packets and thus the corresponding ACK packet that would * release the following packet. */ if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* We set skb->ooo_okay to one if this packet can select * a different TX queue than prior packets of this flow, * to avoid self inflicted reorders. * The 'other' queue decision is based on current cpu number * if XPS is enabled, or sk->sk_txhash otherwise. * We can switch to another (and better) queue if: * 1) No packet with payload is in qdisc/device queues. * Delays in TX completion can defeat the test * even if packets were already sent. * 2) Or rtx queue is empty. * This mitigates above case if ACK packets for * all prior packets were already processed. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. */ th->window = htons(min(tp->rcv_wnd, 65535U)); } tcp_options_write(th, tp, NULL, &opts, &key); if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, sk, skb); #endif } else if (tcp_key_is_ao(&key)) { int err; err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); tcp_add_tx_delay(skb, tp); err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; } static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); } /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } /* Initialize TSO segments for a packet. */ static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs; if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ TCP_SKB_CB(skb)->tcp_gso_size = 0; tcp_skb_pcount_set(skb, 1); return 1; } TCP_SKB_CB(skb)->tcp_gso_size = mss_now; tso_segs = DIV_ROUND_UP(skb->len, mss_now); tcp_skb_pcount_set(skb, tso_segs); return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); tp->packets_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= decr; /* Reno case is special. Sigh... */ if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); } static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->txstamp_ack || (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; TCP_SKB_CB(skb)->txstamp_ack = 0; } } static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) { TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(skb)->eor = 0; } /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sk_buff *buff, struct sock *sk, enum tcp_queue tcp_queue) { if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) __skb_queue_after(&sk->sk_write_queue, skb, buff); else tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); } /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int old_factor; long limit; int nlen; u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. * We need some allowance to not penalize applications setting small * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } if (skb_unclone_keeptruesize(skb, gfp)) return -ENOMEM; /* Get a new skb... force flag on. */ buff = tcp_stream_alloc_skb(sk, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); nlen = skb->len - len; buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Update delivered info for the new segment */ TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; /* If this packet has been sent out already, we must * adjust the various packet counters. */ if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) { int diff = old_factor - tcp_skb_pcount(skb) - tcp_skb_pcount(buff); if (diff) tcp_adjust_pcount(sk, skb, diff); } /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE) list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } /* This is similar to __pskb_pull_tail(). The difference is that pulled * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb)); eat = len; k = 0; shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { int size = skb_frag_size(&shinfo->frags[i]); if (size <= eat) { skb_frag_unref(skb, i); eat -= size; } else { shinfo->frags[k] = shinfo->frags[i]; if (eat) { skb_frag_off_add(&shinfo->frags[k], eat); skb_frag_size_sub(&shinfo->frags[k], eat); eat = 0; } k++; } } shinfo->nr_frags = k; skb->data_len -= len; skb->len = skb->data_len; return len; } /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { u32 delta_truesize; if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->truesize -= delta_truesize; sk_wmem_queued_add(sk, -delta_truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, delta_truesize); /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); return 0; } /* Calculate MSS not accounting any TCP options. */ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ mss_now = max(mss_now, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } /* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { /* Subtract TCP options size, not including SACKs */ return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } EXPORT_SYMBOL(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; } EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. It is minimum of user_mss and mss received with SYN. It also does not include TCP options. inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, taking into account current pmtu, but never exceeds tp->rx_opt.mss_clamp. NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; if (icsk->icsk_mtup.enabled) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; return mss_now; } EXPORT_SYMBOL(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. */ unsigned int tcp_current_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); u32 mss_now; unsigned int header_len; struct tcp_out_options opts; struct tcp_key key; mss_now = tp->mss_cache; if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } tcp_get_current_key(sk, &key); header_len = tcp_established_options(sk, NULL, &opts, &key) + sizeof(struct tcphdr); /* The mss_cache is sized based on tp->tcp_header_len, which assumes * some common options. If this is an odd packet (because we have SACK * blocks etc) then our calculated header_len will be different, and * we have to adjust mss_now correspondingly */ if (header_len != tp->tcp_header_len) { int delta = (int) header_len - tp->tcp_header_len; mss_now -= delta; } return mss_now; } /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. * As additional protections, we do not touch cwnd in retransmission phases, * and if application hit its sndbuf limit recently. */ static void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } tp->snd_cwnd_stamp = tcp_jiffies32; } static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); /* Track the strongest available signal of the degree to which the cwnd * is fully utilized. If cwnd-limited then remember that fact for the * current window. If not cwnd-limited then track the maximum number of * outstanding packets in the current window. (If cwnd-limited then we * chose to not update tp->max_packets_out to avoid an extra else * clause with no functional impact.) */ if (!before(tp->snd_una, tp->cwnd_usage_seq) || is_cwnd_limited || (!tp->is_cwnd_limited && tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; tp->max_packets_out = tp->packets_out; tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_jiffies32; } else { /* Network starves. */ if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) && (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && !ca_ops->cong_control) tcp_cwnd_application_limited(sk); /* The following conditions together indicate the starvation * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); } } /* Minshall's variant of the Nagle send check. */ static bool tcp_minshall_check(const struct tcp_sock *tp) { return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } /* Update snd_sml if this skb is under mss * Note that a TSO packet might end with a sub-mss segment * The test is really : * if ((skb->len % mss) != 0) * tp->snd_sml = TCP_SKB_CB(skb)->end_seq; * But we can avoid doing the divide again given we already have * skb_pcount = skb->len / mss_now */ static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now, const struct sk_buff *skb) { if (skb->len < tcp_skb_pcount(skb) * mss_now) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } /* Return false, if packet can be sent now without violation Nagle's rules: * 1. It is full sized. (provided by caller in %partial bool) * 2. Or it contains FIN. (already checked by caller) * 3. Or TCP_CORK is not set, and TCP_NODELAY is set. * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return how many segs we'd like on a TSO packet, * depending on current pacing rate, and how close the peer is. * * Rationale is: * - For close peers, we rather send bigger packets to reduce * cpu costs, because occasional losses will be repaired fast. * - For long distance/rtt flows, we would like to get ACK clocking * with 1 ACK per ms. * * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting * in bigger TSO bursts. We we cut the RTT-based allowance in half * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { unsigned long bytes; u32 r; bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; min_tso = ca_ops->min_tso_segs ? ca_ops->min_tso_segs(sk) : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; partial = needed % mss_now; /* If last segment is not a full MSS, check if Nagle rules allow us * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; /* For better scheduling, ensure we have at least * 2 GSO packets in flight. */ halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) return tcp_set_skb_tso_segs(skb, mss_now); return tso_segs; } /* Return true if the Nagle test allows this packet to be * sent now. */ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss, int nonagle) { /* Nagle rule does not apply to frames, which sit in the middle of the * write_queue (they have no chances to get new data). * * This is implemented in the callers, where they modify the 'nonagle' * argument based upon the location of SKB in the send queue. */ if (nonagle & TCP_NAGLE_PUSH) return true; /* Don't use the nagle rule for urgent data (or for the final FIN). */ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle)) return true; return false; } /* Does at least the first segment of SKB fit into the send window? */ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; return !after(end_seq, tcp_wnd_end(tp)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions * in order to speed up the splitting operation. In particular, we * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { int nlen = skb->len - len; struct sk_buff *buff; u8 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); buff = tcp_stream_alloc_skb(sk, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); mptcp_skb_ext_copy(buff, skb); sk_wmem_queued_add(sk, buff->truesize); sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; /* Correct the sequence numbers. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; tcp_skb_fragment_eor(skb, buff); skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); /* Fix up tso_factor for both original and new SKB. */ tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); /* Link BUFF into the send queue. */ __skb_header_release(buff); tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE); return 0; } /* Try to defer sending, if possible, in order to minimize the amount * of TSO splitting we do. View it as a kind of TSO Nagle test. * * This algorithm is from John Heffner. */ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, bool *is_rwnd_limited, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; s64 delta; if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer * only if the last write was recent (1 ms). * Note that tp->tcp_wstamp_ns can be in the future if we have * packets waiting in a qdisc or device for EDT delivery. */ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC; if (delta > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ if (limit >= max_segs * tp->mss_cache) goto send_now; /* Middle in queue won't get any more data, full sendable already? */ if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { /* Different approach, try not to defer past a single * ACK. Receiver should ACK every other full sized * frame, so if we have space for more than 3 frames * then send now. */ if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache) goto send_now; } /* TODO : use tsorted_sent_queue ? */ head = tcp_rtx_queue_head(sk); if (!head) goto send_now; delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; /* Ok, it looks like it is advisable to defer. * Three cases are tracked : * 1) We are cwnd-limited * 2) We are rwnd-limited * 3) We are application limited. */ if (cong_win < send_win) { if (cong_win <= skb->len) { *is_cwnd_limited = true; return true; } } else { if (send_win <= skb->len) { *is_rwnd_limited = true; return true; } } /* If this packet won't get more data, do not wait. */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || TCP_SKB_CB(skb)->eor) goto send_now; return true; send_now: return false; } static inline void tcp_mtu_check_reprobe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); u32 interval; s32 delta; interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval); delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp; if (unlikely(delta >= interval * HZ)) { int mss = tcp_current_mss(sk); /* Update current search range */ icsk->icsk_mtup.probe_size = 0; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); /* Update probe time stamp */ icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } } static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) { struct sk_buff *skb, *next; skb = tcp_send_head(sk); tcp_for_write_queue_from_safe(skb, next, sk) { if (len <= skb->len) break; if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next)) return false; len -= skb->len; } return true; } static int tcp_clone_payload(struct sock *sk, struct sk_buff *to, int probe_size) { skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags; int i, todo, len = 0, nr_frags = 0; const struct sk_buff *skb; if (!sk_wmem_schedule(sk, to->truesize + probe_size)) return -ENOMEM; skb_queue_walk(&sk->sk_write_queue, skb) { const skb_frag_t *fragfrom = skb_shinfo(skb)->frags; if (skb_headlen(skb)) return -EINVAL; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) { if (len >= probe_size) goto commit; todo = min_t(int, skb_frag_size(fragfrom), probe_size - len); len += todo; if (lastfrag && skb_frag_page(fragfrom) == skb_frag_page(lastfrag) && skb_frag_off(fragfrom) == skb_frag_off(lastfrag) + skb_frag_size(lastfrag)) { skb_frag_size_add(lastfrag, todo); continue; } if (unlikely(nr_frags == MAX_SKB_FRAGS)) return -E2BIG; skb_frag_page_copy(fragto, fragfrom); skb_frag_off_copy(fragto, fragfrom); skb_frag_size_set(fragto, todo); nr_frags++; lastfrag = fragto++; } } commit: WARN_ON_ONCE(len != probe_size); for (i = 0; i < nr_frags; i++) skb_frag_ref(to, i); skb_shinfo(to)->nr_frags = nr_frags; to->truesize += probe_size; to->len += probe_size; to->data_len += probe_size; __skb_header_release(to); return 0; } /* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if * all its payload was moved to another one (dst). * Make sure to transfer tcp_flags, eor, and tstamp. */ static void tcp_eat_one_skb(struct sock *sk, struct sk_buff *dst, struct sk_buff *src) { TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags; TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor; tcp_skb_collapse_tstamp(dst, src); tcp_unlink_write_queue(src, sk); tcp_wmem_free_skb(sk, src); } /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing * changes resulting in larger path MTUs. * * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, * -1 otherwise */ static int tcp_mtu_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *nskb, *next; struct net *net = sock_net(sk); int probe_size; int size_needed; int copy, len; int mss_now; int interval; /* Not currently probing/verifying, * not in recovery, * have enough cwnd, and * not SACKing (the variable headers throw things off) */ if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; /* Use binary search for probe_size between tcp_mss_base, * and current mss_clamp. if (search_high - search_low) * smaller than a threshold, backoff from probing. */ mss_now = tcp_current_mss(sk); probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; /* When misfortune happens, we are reprobing actively, * and then reprobe timer has expired. We stick with current * probing process by not resetting search range to its orignal. */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) { /* Check whether enough time has elaplased for * another round of probing. */ tcp_mtu_check_reprobe(sk); return -1; } /* Have enough data in the send queue to probe? */ if (tp->write_seq - tp->snd_nxt < size_needed) return -1; if (tp->snd_wnd < size_needed) return -1; if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else return 0; } if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) return -1; /* We're allowed to probe. Build it now. */ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false); if (!nskb) return -1; /* build the payload, and be prepared to abort if this fails. */ if (tcp_clone_payload(sk, nskb, probe_size)) { tcp_skb_tsorted_anchor_cleanup(nskb); consume_skb(nskb); return -1; } sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); skb_copy_decrypted(nskb, skb); mptcp_skb_ext_copy(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); if (skb->len <= copy) { tcp_eat_one_skb(sk, nskb, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); TCP_SKB_CB(skb)->seq += copy; } len += copy; if (len >= probe_size) break; } tcp_init_tso_segs(nskb, nskb->len); /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). */ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; return 1; } return -1; } static bool tcp_pacing_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_needs_internal_pacing(sk)) return false; if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) return false; if (!hrtimer_is_queued(&tp->pacing_timer)) { hrtimer_start(&tp->pacing_timer, ns_to_ktime(tp->tcp_wstamp_ns), HRTIMER_MODE_ABS_PINNED_SOFT); sock_hold(sk); } return true; } static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) { const struct rb_node *node = sk->tcp_rtx_queue.rb_node; /* No skb in the rtx queue. */ if (!node) return true; /* Only one skb in rtx queue. */ return !node->rb_left && !node->rb_right; } /* TCP Small Queues : * Control number of packets in qdisc/devices to two packets / or ~1 ms. * (These limits are doubled for retransmits) * This allows for : * - better RTT estimation and ACK scheduling * - faster recovery * - high rates * Alas, some drivers / subsystems require a fair amount * of queued bytes to ensure line rate. * One example is wifi aggregation (802.11 AMPDU) */ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { unsigned long limit; limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); if (sk->sk_pacing_status == SK_PACING_NONE) limit = min_t(unsigned long, limit, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && tcp_sk(sk)->tcp_tx_delay) { u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) * tcp_sk(sk)->tcp_tx_delay; /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we * approximate our needs assuming an ~100% skb->truesize overhead. * USEC_PER_SEC is approximated by 2^20. * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift. */ extra_bytes >>= (20 - 1); limit += extra_bytes; } if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); /* It is possible TX completion already happened * before we set TSQ_THROTTLED, so we must * test again the condition. */ smp_mb__after_atomic(); if (refcount_read(&sk->sk_wmem_alloc) > limit) return true; } return false; } static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; if (old > TCP_CHRONO_UNSPEC) tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* If there are multiple conditions worthy of tracking in a * chronograph then the highest priority enum takes precedence * over the other conditions. So that if something "more interesting" * starts happening, stop the previous chrono and start a new one. */ if (type > tp->chrono_type) tcp_chrono_set(tp, type); } void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) { struct tcp_sock *tp = tcp_sk(sk); /* There are multiple conditions worthy of tracking in a * chronograph, so that the highest priority enum takes * precedence over the other conditions (see tcp_chrono_start). * If a condition stops, we only stop chrono tracking if * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); } /* First skb in the write queue is smaller than ideal packet size. * Check if we can move payload from the second skb in the queue. */ static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount) { struct sk_buff *next_skb = skb->next; unsigned int nlen; if (tcp_skb_is_last(sk, skb)) return; if (!tcp_skb_can_collapse(skb, next_skb)) return; nlen = min_t(u32, amount, next_skb->len); if (!nlen || !skb_shift(skb, next_skb, nlen)) return; TCP_SKB_CB(skb)->end_seq += nlen; TCP_SKB_CB(next_skb)->seq += nlen; if (!next_skb->len) { /* In case FIN is set, we need to update end_seq */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; tcp_eat_one_skb(sk, skb, next_skb); } } /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. * * LARGESEND note: !tcp_urg_mode is overkill, only frames between * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * * Send at most one packet when push_one > 0. Temporarily ignore * cwnd limit to force at most one packet out when push_one == 2. * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int tso_segs, sent_pkts; u32 cwnd_quota, max_segs; int result; bool is_cwnd_limited = false, is_rwnd_limited = false; sent_pkts = 0; tcp_mstamp_refresh(tp); if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); if (!result) { return false; } else if (result > 0) { sent_pkts = 1; } } max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; int missing_bytes; if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } if (tcp_pacing_check(sk)) break; cwnd_quota = tcp_cwnd_test(tp); if (!cwnd_quota) { if (push_one == 2) /* Force out a loss probe pkt. */ cwnd_quota = 1; else break; } cwnd_quota = min(cwnd_quota, max_segs); missing_bytes = cwnd_quota * mss_now - skb->len; if (missing_bytes > 0) tcp_grow_skb(sk, skb, missing_bytes); tso_segs = tcp_set_skb_tso_segs(skb, mss_now); if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { is_rwnd_limited = true; break; } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) break; } else { if (!push_one && tcp_tso_should_defer(sk, skb, &is_cwnd_limited, &is_rwnd_limited, max_segs)) break; } limit = mss_now; if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, cwnd_quota, nonagle); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; if (tcp_small_queue_check(sk, skb, 0)) break; /* Argh, we hit an empty skb(), presumably a thread * is sleeping in sendmsg()/sk_stream_wait_memory(). * We do not want to send a pure-ack packet and have * a strange looking rtx queue with empty packet(s). */ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; repair: /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); if (push_one) break; } if (is_rwnd_limited) tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED); else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); if (likely(sent_pkts || is_cwnd_limited)) tcp_cwnd_validate(sk, is_cwnd_limited); if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, timeout_us, rto_delta_us; int early_retrans; /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (rcu_access_pointer(tp->fastopen_rsk)) return false; early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans); /* Schedule a loss probe in 2*RTT for SACK capable connections * not in loss recovery, that are either limited by cwnd or application. */ if ((early_retrans != 3 && early_retrans != 4) || !tp->packets_out || !tcp_is_sack(tp) || (icsk->icsk_ca_state != TCP_CA_Open && icsk->icsk_ca_state != TCP_CA_CWR)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ if (tp->srtt_us) { timeout_us = tp->srtt_us >> 2; if (tp->packets_out == 1) timeout_us += tcp_rto_min_us(sk); else timeout_us += TCP_TIMEOUT_MIN_US; timeout = usecs_to_jiffies(timeout_us); } else { timeout = TCP_TIMEOUT_INIT; } /* If the RTO formula yields an earlier time, then use that time. */ rto_delta_us = advancing_rto ? jiffies_to_usecs(inet_csk(sk)->icsk_rto) : tcp_rto_delta_us(sk); /* How far in future is RTO? */ if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); if (skb_fclone_busy(sk, skb)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } } return false; } /* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); /* At most one outstanding TLP */ if (tp->tlp_high_seq) goto rearm_timer; tp->tlp_retrans = 0; skb = tcp_send_head(sk); if (skb && tcp_snd_wnd_test(tp, skb, mss)) { pcount = tp->packets_out; tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); if (tp->packets_out > pcount) goto probe_sent; goto rearm_timer; } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { tcp_warn_once(sk, tp->packets_out, "invalid inflight: "); smp_store_release(&inet_csk(sk)->icsk_pending, 0); return; } if (skb_still_in_host_queue(sk, skb)) goto rearm_timer; pcount = tcp_skb_pcount(skb); if (WARN_ON(!pcount)) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; tp->tlp_retrans = 1; probe_sent: /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ smp_store_release(&inet_csk(sk)->icsk_pending, 0); rearm_timer: tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. */ if (unlikely(sk->sk_state == TCP_CLOSE)) return; if (tcp_write_xmit(sk, cur_mss, nonagle, 0, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { struct sk_buff *skb = tcp_send_head(sk); BUG_ON(!skb || skb->len < mss_now); tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); } /* This function returns the amount that we can raise the * usable window based on the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * RFC 1122: * "the suggested [SWS] avoidance algorithm for the receiver is to keep * RECV.NEXT + RCV.WIN fixed until: * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" * * i.e. don't raise the right edge of the window until you can raise * it at least MSS bytes. * * Unfortunately, the recommended algorithm breaks header prediction, * since header prediction assumes th->window stays fixed. * * Strictly speaking, keeping th->window fixed violates the receiver * side SWS prevention criteria. The problem is that under this rule * a stream of single byte packets will cause the right side of the * window to always advance by a single byte. * * Of course, if the sender implements sender side SWS prevention * then this will not be a problem. * * BSD seems to make the following compromise: * * If the free space is less than the 1/4 of the maximum * space available and the free space is less than 1/2 mss, * then set the window to 0. * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] * Otherwise, just prevent the window from shrinking * and from being larger than the largest representable value. * * This prevents incremental opening of the window in the regime * where TCP is limited by the speed of the reader side taking * data out of the TCP receive queue. It does nothing about * those cases where the window is constrained on the sender side * because the pipeline is full. * * BSD also seems to "accidentally" limit itself to windows that are a * multiple of MSS, at least until the free space gets quite small. * This would appear to be a side effect of the mbuf implementation. * Combining these two algorithms results in the observed behavior * of having a fixed window size at almost all times. * * Below we obtain similar behavior by forcing the offered window to * a multiple of the mss when it is feasible to do so. * * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. * Regular options like TIMESTAMP are taken into account. */ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); int full_space, window; if (sk_is_mptcp(sk)) mptcp_space(sk, &free_space, &allowed_space); full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; if (mss <= 0) return 0; } /* Only allow window shrink if the sysctl is enabled and we have * a non-zero scaling factor in effect. */ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale) goto shrink_window_allowed; /* do not allow window to shrink */ if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); /* if free space is less than mss estimate, or is below 1/16th * of the maximum allowed, try to move to zero-window, else * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and * new incoming data is dropped due to memory limits. * With large window, mss test triggers way too late in order * to announce zero window in time before rmem limit kicks in. */ if (free_space < (allowed_space >> 4) || free_space < mss) return 0; } if (free_space > tp->rcv_ssthresh) free_space = tp->rcv_ssthresh; /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ if (tp->rx_opt.rcv_wscale) { window = free_space; /* Advertise enough space so that it won't get scaled away. * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the * free space we just keep it. This prevents the divide * and multiply from happening most of the time. * We also don't do any window rounding when the free space * is too small. */ if (window <= free_space - mss || window > free_space) window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; } return window; shrink_window_allowed: /* new window should always be an exact multiple of scaling factor */ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); /* if free space is too low, return a zero window */ if (free_space < (allowed_space >> 4) || free_space < mss || free_space < (1 << tp->rx_opt.rcv_wscale)) return 0; } if (free_space > tp->rcv_ssthresh) { free_space = tp->rcv_ssthresh; /* new window should always be an exact multiple of scaling factor * * For this case, we ALIGN "up" (increase free_space) because * we know free_space is not zero here, it has been reduced from * the memory-based limit, and rcv_ssthresh is not a hard limit * (unlike sk_rcvbuf). */ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale)); } return free_space; } void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { if (unlikely(tcp_has_tx_tstamp(next_skb))) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack; } } /* Collapses two adjacent SKB's during retransmission. */ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); int next_skb_size; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); tcp_skb_collapse_tstamp(skb, next_skb); tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } /* Check if coalescing SKBs is legal. */ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; if (skb_cloned(skb)) return false; if (!skb_frags_readable(skb)) return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; return true; } /* Collapse packets in the retransmit queue to make to create * less packets on the wire. This is only done on retransmission. */ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, int space) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = to, *tmp; bool first = true; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)) return; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; if (first) { first = false; continue; } if (space < 0) break; if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) break; if (!tcp_collapse_retrans(sk, to)) break; } } /* This retransmits one SKB. Policy decisions and retransmit queue * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; int diff, len, err; int avail_wnd; /* Inconclusive MTU probe */ if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; if (skb_still_in_host_queue(sk, skb)) return -EBUSY; start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; TCP_SKB_CB(skb)->seq++; goto start; } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); return -EINVAL; } if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return -ENOMEM; } if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ cur_mss = tcp_current_mss(sk); avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit of one segment serves as a zero window probe. */ if (avail_wnd <= 0) { if (TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; avail_wnd = cur_mss; } len = cur_mss * segs; if (len > avail_wnd) { len = rounddown(avail_wnd, cur_mss); if (!len) len = avail_wnd; } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) return -ENOMEM; diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); diff -= tcp_skb_pcount(skb); if (diff) tcp_adjust_pcount(sk, skb, diff); avail_wnd = min_t(int, avail_wnd, cur_mss); if (skb->len < avail_wnd) tcp_retrans_try_collapse(sk, skb, avail_wnd); } /* RFC3168, section 6.1.1.1. ECN fallback */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; tp->bytes_retrans += skb->len; /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); if (nskb) { nskb->dev = NULL; err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); } else { err = -ENOBUFS; } } tcp_skb_tsorted_restore(skb); if (!err) { tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); } /* To avoid taking spuriously low RTT samples based on a timestamp * for a transmit that never happened, always mark EVER_RETRANS */ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; return err; } int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { net_dbg_ratelimited("retrans_out leaked\n"); } #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); } /* Save stamp of the first (attempted) retransmit. */ if (!tp->retrans_stamp) tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb); if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); return err; } /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. It tries to continue * resending the rest of the retransmit queue, until either * we've sent it all or the congestion window limit is reached. */ void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); bool rearm_timer = false; u32 max_segs; int mib_idx; if (!tp->packets_out) return; rtx_head = tcp_rtx_queue_head(sk); skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); skb_rbtree_walk_from(skb) { __u8 sacked; int segs; if (tcp_pacing_check(sk)) break; /* we could do better than to assign each time */ if (!hole) tp->retransmit_skb_hint = skb; segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets */ segs = min_t(int, segs, max_segs); if (tp->retrans_out >= tp->lost_out) { break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; if (tcp_small_queue_check(sk, skb, 1)) break; if (tcp_retransmit_skb(sk, skb, segs)) break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) rearm_timer = true; } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. * In general, we want to allow one skb per socket to avoid hangs * with edge trigger epoll() */ void sk_forced_mem_schedule(struct sock *sk, int size) { int delta, amt; delta = size - sk->sk_forward_alloc; if (delta <= 0) return; amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_charge_skmem(sk->sk_memcg, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. */ void tcp_send_fin(struct sock *sk) { struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); /* Optimization, tack on the FIN if we have one skb in write queue and * this skb was not yet sent, or we are under memory pressure. * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ tskb = tail; if (!tskb && tcp_under_memory_pressure(sk)) tskb = skb_rb_last(&sk->tcp_rtx_queue); if (tskb) { TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; if (!tail) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have * if FIN had been sent. This is because retransmit path * does not change tp->snd_nxt. */ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1); return; } } else { skb = alloc_skb_fclone(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!skb)) return; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); } /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason) { struct sk_buff *skb; TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ trace_tcp_send_reset(sk, NULL, reason); } /* Send a crossed SYN-ACK during socket establishment. * WARNING: This routine must only be called when we have already sent * a SYN packet that crossed the incoming SYN that caused this routine * to get called. If this assumption fails then the initial rcv_wnd * and rcv_wscale values will not be correct. */ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = skb_copy(skb, GFP_ATOMIC); } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /** * tcp_make_synack - Allocate one skb and build a SYNACK packet. * @sk: listener socket * @dst: dst entry attached to the SYNACK. It is consumed and caller * should not use it again. * @req: request_sock pointer * @foc: cookie for tcp fast open * @synack_type: Type of synack to prepare * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); struct tcp_out_options opts; struct tcp_key key = {}; struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; int mss; u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); switch (synack_type) { case TCP_SYNACK_NORMAL: skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, * to avoid false sharing. */ break; case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); break; } skb_dst_set(skb, dst); mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), SKB_CLOCK_MONOTONIC); else #endif { skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif if (tcp_rsk_used_ao(req)) { #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); /* If there is no matching key - avoid sending anything, * especially usigned segments. It could try harder and lookup * for another peer-matching key, but the peer has requested * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. */ if (unlikely(!ao_key)) { trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", keyid); return NULL; } key.ao_key = ao_key; key.type = TCP_KEY_AO; #endif } else { #ifdef CONFIG_TCP_MD5SIG key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &key, foc, synack_type, syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; skb->ip_summed = CHECKSUM_PARTIAL; th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* Okay, we have all we need - do the md5 hash if needed */ if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, req_to_sk(req), skb); #endif } else if (tcp_key_is_ao(&key)) { #ifdef CONFIG_TCP_AO tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location, key.ao_key, req, skb, opts.hash_location - (u8 *)th, 0); #endif } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_unlock(); #endif bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; } EXPORT_SYMBOL(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); if (ca_key == TCP_CA_UNSPEC) return; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } rcu_read_unlock(); } /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ tp->tcp_header_len = sizeof(struct tcphdr); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; WRITE_ONCE(sk->sk_err, 0); sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; __skb_header_release(skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } /* Build and send a SYN with data and (cached) Fast Open cookie. However, * queue a data-only packet after the regular SYN, such that regular SYNs * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges * only the SYN sequence, the data are retransmitted in the first ACK. * If cookie is not cached or other error occurs, falls back to send a * regular SYN with Fast Open cookie request option. */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); /* Sync mss_cache after updating the mss_clamp */ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); if (space && !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), pfrag, sk->sk_allocation)) goto fallback; syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { space = min_t(size_t, space, pfrag->size - pfrag->offset); space = tcp_wmem_schedule(sk, space); } if (space) { space = copy_page_from_iter(pfrag->page, pfrag->offset, space, &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } skb_fill_page_desc(syn_data, 0, pfrag->page, pfrag->offset, space); page_ref_inc(pfrag->page); pfrag->offset += space; skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; fo->copied = space; tcp_connect_queue_skb(sk, syn_data); if (syn_data->len) tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) * we keep in write queue in case of a retransmit, as we * also have the SYN packet (with no data) in the same queue. */ TCP_SKB_CB(syn_data)->seq++; TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } /* data was not sent, put it in write_queue */ __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) fo->cookie.len = 0; err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; } /* Build a SYN and send it off. */ int tcp_connect(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL); #if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO) /* Has to be checked late, after setting daddr/saddr/ops. * Return error if the peer has both a md5 and a tcp-ao key * configured as this is ambiguous. */ if (unlikely(rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)))) { bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1); bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk); struct tcp_ao_info *ao_info; ao_info = rcu_dereference_check(tp->ao_info, lockdep_sock_is_held(sk)); if (ao_info) { /* This is an extra check: tcp_ao_required() in * tcp_v{4,6}_parse_md5_keys() should prevent adding * md5 keys on ao_required socket. */ needs_ao |= ao_info->ao_required; WARN_ON_ONCE(ao_info->ao_required && needs_md5); } if (needs_md5 && needs_ao) return -EKEYREJECTED; /* If we have a matching md5 key and no matching tcp-ao key * then free up ao_info if allocated. */ if (needs_md5) { tcp_ao_destroy_sock(sk, false); } else if (needs_ao) { tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, lockdep_sock_is_held(sk))); } } #endif #ifdef CONFIG_TCP_AO if (unlikely(rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)))) { /* Don't allow connecting if ao is configured but no * matching key is found. */ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1)) return -EKEYREJECTED; } #endif if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ tcp_connect_init(sk); if (unlikely(tp->repair)) { tcp_finish_connect(sk, NULL); return 0; } buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); if (err == -ECONNREFUSED) return err; /* We change tp->snd_nxt after the tcp_transmit_skb() call * in order to make this packet get counted in tcpOutSegs. */ WRITE_ONCE(tp->snd_nxt, tp->write_seq); tp->pushed_seq = tp->write_seq; buff = tcp_send_head(sk); if (unlikely(buff)) { WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq); tp->pushed_seq = TCP_SKB_CB(buff)->seq; } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() * for details. */ void tcp_send_delayed_ack(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; if (inet_csk_in_pingpong_mode(sk) || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt_us) { int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3), TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; } ato = min(ato, max_ato); } ato = min_t(u32, ato, tcp_delack_max(sk)); /* Stay within the limit we were given */ timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. */ if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) { struct sk_buff *buff; /* If we have been reset, we may not send again. */ if (sk->sk_state == TCP_CLOSE) return; /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. */ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (unlikely(!buff)) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; if (delay < TCP_RTO_MAX) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784 */ skb_set_tcp_pure_ack(buff); /* Send it off, this clears delayed acks for us. */ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); } EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); } /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. * * Question: what should we make while urgent mode? * 4.4BSD forces sending single byte of data. We cannot send * out of window data, because we have SND.NXT==SND.MAX... * * Current solution: to send TWO zero-length segments in urgent mode: * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * out-of-date with SND.UNA-1 to probe window. */ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); if (!skb) return -1; /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } /* Called from setsockopt( ... TCP_REPAIR ) */ void tcp_send_window_probe(struct sock *sk) { if (sk->sk_state == TCP_ESTABLISHED) { tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; tcp_mstamp_refresh(tcp_sk(sk)); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE); } } /* Initiate keepalive or window probe from timer. */ int tcp_write_wakeup(struct sock *sk, int mib) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; if (sk->sk_state == TCP_CLOSE) return -1; skb = tcp_send_head(sk); if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { int err; unsigned int mss = tcp_current_mss(sk); unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; /* We are probing the opening of a window * but the window size is != 0 * must have been a result SWS avoidance ( sender ) */ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); return err; } else { if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) tcp_xmit_probe_skb(sk, 1, mib); return tcp_xmit_probe_skb(sk, 0, mib); } } /* A window probe timeout has occurred. If window is not closed send * a partial packet else a zero probe. */ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) { /* sk has const attribute because listeners are lockless. * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); } return res; } EXPORT_SYMBOL(tcp_rtx_synack);
59 59 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC remote transport endpoint record management * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include <net/route.h> #include <net/ip6_route.h> #include "ar-internal.h" static const struct sockaddr_rxrpc rxrpc_null_addr; /* * Hash a peer key. */ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { const u16 *p; unsigned int i, size; unsigned long hash_key; _enter(""); hash_key = (unsigned long)local / __alignof__(*local); hash_key += srx->transport_type; hash_key += srx->transport_len; hash_key += srx->transport.family; switch (srx->transport.family) { case AF_INET: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin6.sin6_addr); p = (u16 *)&srx->transport.sin6.sin6_addr; break; #endif default: WARN(1, "AF_RXRPC: Unsupported transport address family\n"); return 0; } /* Step through the peer address in 16-bit portions for speed */ for (i = 0; i < size; i += sizeof(*p), p++) hash_key += *p; _leave(" 0x%lx", hash_key); return hash_key; } /* * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same * or greater than. * * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted * buckets and mid-bucket insertion, so we don't make full use of this * information at this point. */ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, struct rxrpc_local *local, const struct sockaddr_rxrpc *srx, unsigned long hash_key) { long diff; diff = ((peer->hash_key - hash_key) ?: ((unsigned long)peer->local - (unsigned long)local) ?: (peer->srx.transport_type - srx->transport_type) ?: (peer->srx.transport_len - srx->transport_len) ?: (peer->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: return ((u16 __force)peer->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&peer->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: return ((u16 __force)peer->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&peer->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } /* * Look up a remote transport endpoint for the specified address using RCU. */ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_local *local, const struct sockaddr_rxrpc *srx, unsigned long hash_key) { struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && refcount_read(&peer->ref) > 0) return peer; } return NULL; } /* * Look up a remote transport endpoint for the specified address using RCU. */ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { struct rxrpc_peer *peer; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer) _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref)); return peer; } /* * assess the MTU size for the network interface through which this peer is * reached */ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, struct rxrpc_peer *peer) { struct net *net = local->net; struct dst_entry *dst; struct rtable *rt; struct flowi fl; struct flowi4 *fl4 = &fl.u.ip4; #ifdef CONFIG_AF_RXRPC_IPV6 struct flowi6 *fl6 = &fl.u.ip6; #endif peer->if_mtu = 1500; memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { case AF_INET: rt = ip_route_output_ports( net, fl4, NULL, peer->srx.transport.sin.sin_addr.s_addr, 0, htons(7000), htons(7001), IPPROTO_UDP, 0, 0); if (IS_ERR(rt)) { _leave(" [route err %ld]", PTR_ERR(rt)); return; } dst = &rt->dst; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: fl6->flowi6_iif = LOOPBACK_IFINDEX; fl6->flowi6_scope = RT_SCOPE_UNIVERSE; fl6->flowi6_proto = IPPROTO_UDP; memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr, sizeof(struct in6_addr)); fl6->fl6_dport = htons(7001); fl6->fl6_sport = htons(7000); dst = ip6_route_output(net, NULL, fl6); if (dst->error) { _leave(" [route err %d]", dst->error); return; } break; #endif default: BUG(); } peer->if_mtu = dst_mtu(dst); dst_release(dst); _leave(" [if_mtu %u]", peer->if_mtu); } /* * Allocate a peer. */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, enum rxrpc_peer_trace why) { struct rxrpc_peer *peer; _enter(""); peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { refcount_set(&peer->ref, 1); peer->local = rxrpc_get_local(local, rxrpc_local_get_peer); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); rxrpc_peer_init_rtt(peer); peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, 1, why); } _leave(" = %p", peer); return peer; } /* * Initialise peer record. */ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; rxrpc_assess_MTU_size(local, peer); peer->mtu = peer->if_mtu; peer->rtt_last_req = ktime_get_real(); switch (peer->srx.transport.family) { case AF_INET: peer->hdrsize = sizeof(struct iphdr); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: peer->hdrsize = sizeof(struct ipv6hdr); break; #endif default: BUG(); } switch (peer->srx.transport_type) { case SOCK_DGRAM: peer->hdrsize += sizeof(struct udphdr); break; default: BUG(); } peer->hdrsize += sizeof(struct rxrpc_wire_header); peer->maxdata = peer->mtu - peer->hdrsize; } /* * Set up a new peer. */ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, unsigned long hash_key, gfp_t gfp) { struct rxrpc_peer *peer; _enter(""); peer = rxrpc_alloc_peer(local, gfp, rxrpc_peer_new_client); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_init_peer(local, peer, hash_key); } _leave(" = %p", peer); return peer; } static void rxrpc_free_peer(struct rxrpc_peer *peer) { trace_rxrpc_peer(peer->debug_id, 0, rxrpc_peer_free); rxrpc_put_local(peer->local, rxrpc_local_put_peer); kfree_rcu(peer, rcu); } /* * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. */ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(local, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); } /* * obtain a remote transport endpoint for the specified address */ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); _enter("{%pISp}", &srx->transport); /* search the peer list first */ rcu_read_lock(); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_lookup_client)) peer = NULL; rcu_read_unlock(); if (!peer) { /* The peer is not yet present in hash - create a candidate * for a new record and then redo the search. */ candidate = rxrpc_create_peer(local, srx, hash_key, gfp); if (!candidate) { _leave(" = NULL [nomem]"); return NULL; } spin_lock(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_lookup_client)) peer = NULL; if (!peer) { hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); list_add_tail(&candidate->keepalive_link, &rxnet->peer_keepalive_new); } spin_unlock(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); else peer = candidate; } _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref)); return peer; } /* * Get a ref on a peer record. */ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) { int r; __refcount_inc(&peer->ref, &r); trace_rxrpc_peer(peer->debug_id, r + 1, why); return peer; } /* * Get a ref on a peer record unless its usage has already reached 0. */ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) { int r; if (peer) { if (__refcount_inc_not_zero(&peer->ref, &r)) trace_rxrpc_peer(peer->debug_id, r + 1, why); else peer = NULL; } return peer; } /* * Discard a peer record. */ static void __rxrpc_put_peer(struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = peer->local->rxnet; ASSERT(hlist_empty(&peer->error_targets)); spin_lock(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); spin_unlock(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } /* * Drop a ref on a peer record. */ void rxrpc_put_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) { unsigned int debug_id; bool dead; int r; if (peer) { debug_id = peer->debug_id; dead = __refcount_dec_and_test(&peer->ref, &r); trace_rxrpc_peer(debug_id, r - 1, why); if (dead) __rxrpc_put_peer(peer); } } /* * Make sure all peer records have been discarded. */ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) { struct rxrpc_peer *peer; int i; for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) { if (hlist_empty(&rxnet->peer_hash[i])) continue; hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { pr_err("Leaked peer %u {%u} %pISp\n", peer->debug_id, refcount_read(&peer->ref), &peer->srx.transport); } } } /** * rxrpc_kernel_get_call_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. * @call: The call to query * * Get a record for the remote peer in a call. */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { return call->peer; } EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); /** * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @peer: The peer to query * * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); /** * rxrpc_kernel_remote_srx - Get the address of a peer * @peer: The peer to query * * Get a pointer to the address from a peer record. The caller is responsible * for making sure that the address is not deallocated. */ const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) { return peer ? &peer->srx : &rxrpc_null_addr; } EXPORT_SYMBOL(rxrpc_kernel_remote_srx); /** * rxrpc_kernel_remote_addr - Get the peer transport address of a call * @peer: The peer to query * * Get a pointer to the transport address from a peer record. The caller is * responsible for making sure that the address is not deallocated. */ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { return (const struct sockaddr *) (peer ? &peer->srx.transport : &rxrpc_null_addr.transport); } EXPORT_SYMBOL(rxrpc_kernel_remote_addr);
1021 301 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMER_H #define _LINUX_TIMER_H #include <linux/list.h> #include <linux/ktime.h> #include <linux/stddef.h> #include <linux/debugobjects.h> #include <linux/stringify.h> #include <linux/timer_types.h> #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting the lockdep_map key * (second argument) here is required, otherwise it could be initialised to * the copy of the lockdep_map later! We use the pointer to and the string * "<file>:<line>" as the key resp. the name of the lockdep_map. */ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn), #else #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif /* * @TIMER_DEFERRABLE: A deferrable timer will work normally when the * system is busy, but will not cause a CPU to come out of idle just * to service it; instead, the timer will be serviced when the CPU * eventually wakes up with a subsequent non-deferrable timer. * * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and * it's safe to wait for the completion of the running instance from * IRQ handlers, for example, by calling del_timer_sync(). * * Note: The irq disabled callback execution is a special case for * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! * * @TIMER_PINNED: A pinned timer will always expire on the CPU on which the * timer was enqueued. When a particular CPU is required, add_timer_on() * has to be used. Enqueue via mod_timer() and add_timer() is always done * on the local CPU. */ #define TIMER_CPUMASK 0x0003FFFF #define TIMER_MIGRATING 0x00040000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 #define TIMER_INIT_FLAGS (TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) #define TIMER_ARRAYSHIFT 22 #define TIMER_ARRAYMASK 0xFFC00000 #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) #define __TIMER_INITIALIZER(_function, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .flags = (_flags), \ __TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE) \ } #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ __TIMER_INITIALIZER(_function, 0) /* * LOCKDEP and DEBUG timer interfaces. */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #else static inline void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { init_timer_key(timer, func, flags, name, key); } #endif #ifdef CONFIG_LOCKDEP #define __init_timer(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\ } while (0) #define __init_timer_on_stack(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ init_timer_on_stack_key((_timer), (_fn), (_flags), \ #_timer, &__key); \ } while (0) #else #define __init_timer(_timer, _fn, _flags) \ init_timer_key((_timer), (_fn), (_flags), NULL, NULL) #define __init_timer_on_stack(_timer, _fn, _flags) \ init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) #endif /** * timer_setup - prepare a timer for first use * @timer: the timer in question * @callback: the function to call when timer expires * @flags: any TIMER_* flags * * Regular timer initialization should use either DEFINE_TIMER() above, * or timer_setup(). For timers on the stack, timer_setup_on_stack() must * be used and must be balanced with a call to destroy_timer_on_stack(). */ #define timer_setup(timer, callback, flags) \ __init_timer((timer), (callback), (flags)) #define timer_setup_on_stack(timer, callback, flags) \ __init_timer_on_stack((timer), (callback), (flags)) #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_timer_on_stack(struct timer_list *timer); #else static inline void destroy_timer_on_stack(struct timer_list *timer) { } #endif #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) /** * timer_pending - is a timer pending? * @timer: the timer in question * * timer_pending will tell whether a given timer is currently pending, * or not. Callers must ensure serialization wrt. other operations done * to this timer, eg. interrupt contexts, or other CPUs on SMP. * * Returns: 1 if the timer is pending, 0 if not. */ static inline int timer_pending(const struct timer_list * timer) { return !hlist_unhashed_lockless(&timer->entry); } extern void add_timer_on(struct timer_list *timer, int cpu); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); extern int timer_reduce(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) extern void add_timer(struct timer_list *timer); extern void add_timer_local(struct timer_list *timer); extern void add_timer_global(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); /** * del_timer_sync - Delete a pending timer and wait for a running callback * @timer: The timer to be deleted * * See timer_delete_sync() for detailed explanation. * * Do not use in new code. Use timer_delete_sync() instead. * * Returns: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static inline int del_timer_sync(struct timer_list *timer) { return timer_delete_sync(timer); } /** * del_timer - Delete a pending timer * @timer: The timer to be deleted * * See timer_delete() for detailed explanation. * * Do not use in new code. Use timer_delete() instead. * * Returns: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static inline int del_timer(struct timer_list *timer) { return timer_delete(timer); } extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); unsigned long round_jiffies_relative(unsigned long j); unsigned long __round_jiffies_up(unsigned long j, int cpu); unsigned long __round_jiffies_up_relative(unsigned long j, int cpu); unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else #define timers_prepare_cpu NULL #define timers_dead_cpu NULL #endif #endif
18 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 /* SPDX-License-Identifier: GPL-2.0 */ /* interrupt.h */ #ifndef _LINUX_INTERRUPT_H #define _LINUX_INTERRUPT_H #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cleanup.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/hardirq.h> #include <linux/irqflags.h> #include <linux/hrtimer.h> #include <linux/kref.h> #include <linux/cpumask_types.h> #include <linux/workqueue.h> #include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/sections.h> /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When * requesting an interrupt without specifying a IRQF_TRIGGER, the * setting should be assumed to be "as already configured", which * may be as per machine or firmware initialisation. */ #define IRQF_TRIGGER_NONE 0x00000000 #define IRQF_TRIGGER_RISING 0x00000001 #define IRQF_TRIGGER_FALLING 0x00000002 #define IRQF_TRIGGER_HIGH 0x00000004 #define IRQF_TRIGGER_LOW 0x00000008 #define IRQF_TRIGGER_MASK (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW | \ IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING) #define IRQF_TRIGGER_PROBE 0x00000010 /* * These flags used only by the kernel as part of the * irq handling routines. * * IRQF_SHARED - allow sharing the irq among several devices * IRQF_PROBE_SHARED - set by callers when they expect sharing mismatches to occur * IRQF_TIMER - Flag to mark this interrupt as timer interrupt * IRQF_PERCPU - Interrupt is per cpu * IRQF_NOBALANCING - Flag to exclude this interrupt from irq balancing * IRQF_IRQPOLL - Interrupt is used for polling (only the interrupt that is * registered first in a shared interrupt is considered for * performance reasons) * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished. * Used by threaded interrupts which need to keep the * irq line disabled until the threaded handler has been run. * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee * that this interrupt will wake the system from a suspended * state. See Documentation/power/suspend-and-interrupts.rst * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this * interrupt handler after suspending interrupts. For system * wakeup devices users need to implement wakeup detection in * their interrupt handlers. * IRQF_NO_AUTOEN - Don't enable IRQ or NMI automatically when users request it. * Users will enable it explicitly by enable_irq() or enable_nmi() * later. * IRQF_NO_DEBUG - Exclude from runnaway detection for IPI and similar handlers, * depends on IRQF_PERCPU. * IRQF_COND_ONESHOT - Agree to do IRQF_ONESHOT if already set for a shared * interrupt. */ #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 #define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 #define IRQF_ONESHOT 0x00002000 #define IRQF_NO_SUSPEND 0x00004000 #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 #define IRQF_COND_SUSPEND 0x00040000 #define IRQF_NO_AUTOEN 0x00080000 #define IRQF_NO_DEBUG 0x00100000 #define IRQF_COND_ONESHOT 0x00200000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) /* * These values can be returned by request_any_context_irq() and * describe the context the interrupt will be run in. * * IRQC_IS_HARDIRQ - interrupt runs in hardirq context * IRQC_IS_NESTED - interrupt runs in a nested threaded context */ enum { IRQC_IS_HARDIRQ = 0, IRQC_IS_NESTED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); /** * struct irqaction - per interrupt action descriptor * @handler: interrupt handler function * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) * @thread_fn: interrupt handler function for threaded interrupts * @thread: thread pointer for threaded interrupts * @secondary: pointer to secondary irqaction (force threading) * @thread_flags: flags related to @thread * @thread_mask: bitmask for keeping track of @thread activity * @dir: pointer to the proc/irq/NN/name entry */ struct irqaction { irq_handler_t handler; void *dev_id; void __percpu *percpu_dev_id; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; struct irqaction *secondary; unsigned int irq; unsigned int flags; unsigned long thread_flags; unsigned long thread_mask; const char *name; struct proc_dir_entry *dir; } ____cacheline_internodealigned_in_smp; extern irqreturn_t no_action(int cpl, void *dev_id); /* * If a (PCI) device interrupt is not connected we set dev->irq to * IRQ_NOTCONNECTED. This causes request_irq() to fail with -ENOTCONN, so we * can distingiush that case from other error returns. * * 0x80000000 is guaranteed to be outside the available range of interrupts * and easy to distinguish from other possible incorrect values. */ #define IRQ_NOTCONNECTED (1U << 31) extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long flags, const char *name, void *dev); /** * request_irq - Add a handler for an interrupt line * @irq: The interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts * If NULL, the default primary handler is installed * @flags: Handling flags * @name: Name of the device generating this interrupt * @dev: A cookie passed to the handler function * * This call allocates an interrupt and establishes a handler; see * the documentation for request_threaded_irq() for details. */ static inline int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev) { return request_threaded_irq(irq, handler, NULL, flags | IRQF_COND_ONESHOT, name, dev); } extern int __must_check request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, devname, percpu_dev_id); } extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *dev); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); extern const void *free_nmi(unsigned int irq, void *dev_id); extern void free_percpu_nmi(unsigned int irq, void __percpu *percpu_dev_id); struct device; extern int __must_check devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, unsigned long irqflags, const char *devname, void *dev_id); static inline int __must_check devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, devname, dev_id); } extern int __must_check devm_request_any_context_irq(struct device *dev, unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); extern bool irq_percpu_is_enabled(unsigned int irq); extern void irq_wake_thread(unsigned int irq, void *dev_id); DEFINE_LOCK_GUARD_1(disable_irq, int, disable_irq(*_T->lock), enable_irq(*_T->lock)) extern void disable_nmi_nosync(unsigned int irq); extern void disable_percpu_nmi(unsigned int irq); extern void enable_nmi(unsigned int irq); extern void enable_percpu_nmi(unsigned int irq, unsigned int type); extern int prepare_percpu_nmi(unsigned int irq); extern void teardown_percpu_nmi(unsigned int irq); extern int irq_inject_interrupt(unsigned int irq); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); extern void rearm_wake_irq(unsigned int irq); /** * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use * @work: Work item, for internal use * @notify: Function to be called on change. This will be * called in process context. * @release: Function to be called on release. This will be * called in process context. Once registered, the * structure must only be freed when this function is * called or later. */ struct irq_affinity_notify { unsigned int irq; struct kref kref; struct work_struct work; void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); void (*release)(struct kref *ref); }; #define IRQ_AFFINITY_MAX_SETS 4 /** * struct irq_affinity - Description for automatic irq affinity assignments * @pre_vectors: Don't apply affinity to @pre_vectors at beginning of * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of * the MSI(-X) vector space * @nr_sets: The number of interrupt sets for which affinity * spreading is required * @set_size: Array holding the size of each interrupt set * @calc_sets: Callback for calculating the number and size * of interrupt sets * @priv: Private data for usage by @calc_sets, usually a * pointer to driver/device specific data. */ struct irq_affinity { unsigned int pre_vectors; unsigned int post_vectors; unsigned int nr_sets; unsigned int set_size[IRQ_AFFINITY_MAX_SETS]; void (*calc_sets)(struct irq_affinity *, unsigned int nvecs); void *priv; }; /** * struct irq_affinity_desc - Interrupt affinity descriptor * @mask: cpumask to hold the affinity assignment * @is_managed: 1 if the interrupt is managed internally */ struct irq_affinity_desc { struct cpumask mask; unsigned int is_managed : 1; }; #if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity); /** * irq_update_affinity_hint - Update the affinity hint * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint, but does not change the affinity of the interrupt. */ static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, false); } /** * irq_set_affinity_and_hint - Update the affinity hint and apply the provided * cpumask to the interrupt * @irq: Interrupt to update * @m: cpumask pointer (NULL to clear the hint) * * Updates the affinity hint and if @m is not NULL it applies it as the * affinity of that interrupt. */ static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return __irq_apply_affinity_hint(irq, m, true); } /* * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() * instead. */ static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return irq_set_affinity_and_hint(irq, m); } extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd); unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) { return 0; } static inline int irq_can_set_affinity(unsigned int irq) { return 0; } static inline int irq_select_affinity(unsigned int irq) { return 0; } static inline int irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { return -EINVAL; } static inline int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { return -EINVAL; } static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { return 0; } static inline struct irq_affinity_desc * irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd) { return NULL; } static inline unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, const struct irq_affinity *affd) { return maxvec; } #endif /* CONFIG_SMP */ /* * Special lockdep variants of irq disabling/enabling. * These should be used for locking constructs that * know that a particular irq context which is disabled, * and which is the only irq-context user of a lock, * that it's safe to take the lock in the irq-disabled * section without disabling hardirqs. * * On !CONFIG_LOCKDEP they are equivalent to the normal * irq disable/enable methods. */ static inline void disable_irq_nosync_lockdep(unsigned int irq) { disable_irq_nosync(irq); #ifdef CONFIG_LOCKDEP local_irq_disable(); #endif } static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags) { disable_irq_nosync(irq); #ifdef CONFIG_LOCKDEP local_irq_save(*flags); #endif } static inline void disable_irq_lockdep(unsigned int irq) { disable_irq(irq); #ifdef CONFIG_LOCKDEP local_irq_disable(); #endif } static inline void enable_irq_lockdep(unsigned int irq) { #ifdef CONFIG_LOCKDEP local_irq_enable(); #endif enable_irq(irq); } static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags) { #ifdef CONFIG_LOCKDEP local_irq_restore(*flags); #endif enable_irq(irq); } /* IRQ wakeup (PM) control: */ extern int irq_set_irq_wake(unsigned int irq, unsigned int on); static inline int enable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 1); } static inline int disable_irq_wake(unsigned int irq) { return irq_set_irq_wake(irq, 0); } /* * irq_get_irqchip_state/irq_set_irqchip_state specific flags */ enum irqchip_irq_state { IRQCHIP_STATE_PENDING, /* Is interrupt pending? */ IRQCHIP_STATE_ACTIVE, /* Is interrupt in progress? */ IRQCHIP_STATE_MASKED, /* Is interrupt masked? */ IRQCHIP_STATE_LINE_LEVEL, /* Is IRQ line high? */ }; extern int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state); extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool state); #ifdef CONFIG_IRQ_FORCED_THREADING # ifdef CONFIG_PREEMPT_RT # define force_irqthreads() (true) # else DECLARE_STATIC_KEY_FALSE(force_irqthreads_key); # define force_irqthreads() (static_branch_unlikely(&force_irqthreads_key)) # endif #else #define force_irqthreads() (false) #endif #ifndef local_softirq_pending #ifndef local_softirq_pending_ref #define local_softirq_pending_ref irq_stat.__softirq_pending #endif #define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref)) #define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x))) #define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x))) #endif /* local_softirq_pending */ /* Some architectures might implement lazy enabling/disabling of * interrupts. In some cases, such as stop_machine, we might want * to ensure that after a local_irq_disable(), interrupts have * really been disabled in hardware. Such architectures need to * implement the following hook. */ #ifndef hard_irq_disable #define hard_irq_disable() do { } while(0) #endif /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high frequency threaded job scheduling. For almost all the purposes tasklets are more than enough. F.e. all serial device BHs et al. should be converted to tasklets, not to softirqs. */ enum { HI_SOFTIRQ=0, TIMER_SOFTIRQ, NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS }; /* * The following vectors can be safely ignored after ksoftirqd is parked: * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue * * _ (HR)TIMER_SOFTIRQ: (hr)timers_dead_cpu() migrates the queue */ #define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(TIMER_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ) |\ BIT(HRTIMER_SOFTIRQ) | BIT(RCU_SOFTIRQ)) /* map softirq index to softirq name. update 'softirq_to_name' in * kernel/softirq.c when adding a new softirq. */ extern const char * const softirq_to_name[NR_SOFTIRQS]; /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ struct softirq_action { void (*action)(void); }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); #ifdef CONFIG_PREEMPT_RT extern void do_softirq_post_smp_call_flush(unsigned int was_pending); #else static inline void do_softirq_post_smp_call_flush(unsigned int unused) { do_softirq(); } #endif extern void open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); /* * With forced-threaded interrupts enabled a raised softirq is deferred to * ksoftirqd unless it can be handled within the threaded interrupt. This * affects timer_list timers and hrtimers which are explicitly marked with * HRTIMER_MODE_SOFT. * With PREEMPT_RT enabled more hrtimers are moved to softirq for processing * which includes all timers which are not explicitly marked HRTIMER_MODE_HARD. * Userspace controlled timers (like the clock_nanosleep() interface) is divided * into two categories: Tasks with elevated scheduling policy including * SCHED_{FIFO|RR|DL} and the remaining scheduling policy. The tasks with the * elevated scheduling policy are woken up directly from the HARDIRQ while all * other wake ups are delayed to softirq and so to ksoftirqd. * * The ksoftirqd runs at SCHED_OTHER policy at which it should remain since it * handles the softirq in an overloaded situation (not handled everything * within its last run). * If the timers are handled at SCHED_OTHER priority then they competes with all * other SCHED_OTHER tasks for CPU resources are possibly delayed. * Moving timers softirqs to a low priority SCHED_FIFO thread instead ensures * that timer are performed before scheduling any SCHED_OTHER thread. */ DECLARE_PER_CPU(struct task_struct *, ktimerd); DECLARE_PER_CPU(unsigned long, pending_timer_softirq); void raise_ktimers_thread(unsigned int nr); static inline unsigned int local_timers_pending_force_th(void) { return __this_cpu_read(pending_timer_softirq); } static inline void raise_timer_softirq(unsigned int nr) { lockdep_assert_in_irq(); if (force_irqthreads()) raise_ktimers_thread(nr); else __raise_softirq_irqoff(nr); } static inline unsigned int local_timers_pending(void) { if (force_irqthreads()) return local_timers_pending_force_th(); else return local_softirq_pending(); } DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) { return this_cpu_read(ksoftirqd); } /* Tasklets --- multithreaded analogue of BHs. This API is deprecated. Please consider using threaded IRQs instead: https://lore.kernel.org/lkml/20200716081538.2sivhkj4hcyrusem@linutronix.de Main feature differing them of generic softirqs: tasklet is running only on one CPU simultaneously. Main feature differing them of BHs: different tasklets may be run simultaneously on different CPUs. Properties: * If tasklet_schedule() is called, then tasklet is guaranteed to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its execution is still not started, it will be executed only once. * If this tasklet is already running on another CPU (or schedule is called from tasklet itself), it is rescheduled for later. * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. */ struct tasklet_struct { struct tasklet_struct *next; unsigned long state; atomic_t count; bool use_callback; union { void (*func)(unsigned long data); void (*callback)(struct tasklet_struct *t); }; unsigned long data; }; #define DECLARE_TASKLET(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .callback = _callback, \ .use_callback = true, \ } #define DECLARE_TASKLET_DISABLED(name, _callback) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .callback = _callback, \ .use_callback = true, \ } #define from_tasklet(var, callback_tasklet, tasklet_fieldname) \ container_of(callback_tasklet, typeof(*var), tasklet_fieldname) #define DECLARE_TASKLET_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(0), \ .func = _func, \ } #define DECLARE_TASKLET_DISABLED_OLD(name, _func) \ struct tasklet_struct name = { \ .count = ATOMIC_INIT(1), \ .func = _func, \ } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } void tasklet_unlock(struct tasklet_struct *t); void tasklet_unlock_wait(struct tasklet_struct *t); void tasklet_unlock_spin_wait(struct tasklet_struct *t); #else static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } static inline void tasklet_unlock(struct tasklet_struct *t) { } static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); static inline void tasklet_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_schedule(t); } extern void __tasklet_hi_schedule(struct tasklet_struct *t); static inline void tasklet_hi_schedule(struct tasklet_struct *t) { if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) __tasklet_hi_schedule(t); } static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); smp_mb__after_atomic(); } /* * Do not use in new code. Disabling tasklets from atomic contexts is * error prone and should be avoided. */ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_spin_wait(t); smp_mb(); } static inline void tasklet_disable(struct tasklet_struct *t) { tasklet_disable_nosync(t); tasklet_unlock_wait(t); smp_mb(); } static inline void tasklet_enable(struct tasklet_struct *t) { smp_mb__before_atomic(); atomic_dec(&t->count); } extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); extern void tasklet_setup(struct tasklet_struct *t, void (*callback)(struct tasklet_struct *)); /* * Autoprobing for irqs: * * probe_irq_on() and probe_irq_off() provide robust primitives * for accurate IRQ probing during kernel initialization. They are * reasonably simple to use, are not "fooled" by spurious interrupts, * and, unlike other attempts at IRQ probing, they do not get hung on * stuck interrupts (such as unused PS2 mouse interfaces on ASUS boards). * * For reasonably foolproof probing, use them as follows: * * 1. clear and/or mask the device's internal interrupt. * 2. sti(); * 3. irqs = probe_irq_on(); // "take over" all unassigned idle IRQs * 4. enable the device and cause it to trigger an interrupt. * 5. wait for the device to interrupt, using non-intrusive polling or a delay. * 6. irq = probe_irq_off(irqs); // get IRQ number, 0=none, negative=multiple * 7. service the device to clear its pending interrupt. * 8. loop again if paranoia is required. * * probe_irq_on() returns a mask of allocated irq's. * * probe_irq_off() takes the mask as a parameter, * and returns the irq number which occurred, * or zero if none occurred, or a negative irq number * if more than one irq occurred. */ #if !defined(CONFIG_GENERIC_IRQ_PROBE) static inline unsigned long probe_irq_on(void) { return 0; } static inline int probe_irq_off(unsigned long val) { return 0; } static inline unsigned int probe_irq_mask(unsigned long val) { return 0; } #else extern unsigned long probe_irq_on(void); /* returns 0 on failure */ extern int probe_irq_off(unsigned long); /* returns 0 or negative on failure */ extern unsigned int probe_irq_mask(unsigned long); /* returns mask of ISA interrupts */ #endif #ifdef CONFIG_PROC_FS /* Initialize /proc/irq/ */ extern void init_irq_proc(void); #else static inline void init_irq_proc(void) { } #endif #ifdef CONFIG_IRQ_TIMINGS void irq_timings_enable(void); void irq_timings_disable(void); u64 irq_timings_next_event(u64 now); #endif struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ #ifndef __irq_entry # define __irq_entry __section(".irqentry.text") #endif #define __softirq_entry __section(".softirqentry.text") #endif
10 10 10 10 10 10 10 10 9 9 1 9 9 7 9 9 7 7 7 7 10 10 20 20 32 32 20 10 2 32 62 62 62 1 62 62 62 62 62 62 64 72 72 72 72 72 72 72 72 72 72 71 72 72 72 72 72 72 72 1 62 62 62 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * Roger Venning <r.venning@telstra.com>: 6to4 support * Nate Thompson <nate@thebog.net>: 6to4 support * Fred Templin <fred.l.templin@boeing.com>: isatap support */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmp.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/ip.h> #include <net/udp.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/inet_dscp.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #define IP6_SIT_HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); static void ipip6_dev_free(struct net_device *dev); static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; static unsigned int sit_net_id __read_mostly; struct sit_net { struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_wc[1]; struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; static inline struct sit_net *dev_to_sit_net(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); return net_generic(t->net, sit_net_id); } /* * Must be invoked with rcu_read_lock */ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, struct net_device *dev, __be32 remote, __be32 local, int sifindex) { unsigned int h0 = HASH(remote); unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); int ifindex = dev ? dev->ifindex : 0; for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(sitn->tunnels_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } static struct ip_tunnel __rcu ** __ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; unsigned int h = 0; int prio = 0; if (remote) { prio |= 2; h ^= HASH(remote); } if (local) { prio |= 1; h ^= HASH(local); } return &sitn->tunnels[prio][h]; } static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, struct ip_tunnel *t) { return __ipip6_bucket(sitn, &t->parms); } static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp; struct ip_tunnel *iter; for (tp = ipip6_bucket(sitn, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) { #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 0; t->ip6rd.prefixlen = 16; t->ip6rd.relay_prefixlen = 0; } else { struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev); memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd)); } #endif } static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); int err; __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags)) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; err = register_netdevice(dev); if (err < 0) goto out; ipip6_tunnel_clone_6rd(dev, sitn); ipip6_tunnel_link(sitn, t); return 0; out: return err; } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, struct ip_tunnel_parm_kern *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; struct ip_tunnel *t, *nt; struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct sit_net *sitn = net_generic(net, sit_net_id); for (tp = __ipip6_bucket(sitn, parms); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && parms->link == t->parms.link) { if (create) return NULL; else return t; } } if (!create) goto failed; if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strscpy(name, parms->name, IFNAMSIZ); } else { strcpy(name, "sit%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!dev) return NULL; dev_net_set(dev, net); nt = netdev_priv(dev); nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; if (!parms->name[0]) strcpy(parms->name, dev->name); return nt; failed_free: free_netdev(dev); failed: return NULL; } #define for_each_prl_rcu(start) \ for (prl = rcu_dereference(start); \ prl; \ prl = rcu_dereference(prl->next)) static struct ip_tunnel_prl_entry * __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) { struct ip_tunnel_prl_entry *prl; for_each_prl_rcu(t->prl) if (prl->addr == addr) break; return prl; } static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); if (cmax > 1 && kprl.addr != htonl(INADDR_ANY)) cmax = 1; /* For simple GET or for root users, * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; ca = min(t->prl_count, cmax); if (!kp) { /* We don't try hard to allocate much memory for * non-root users. * For root users, retry allocating enough memory for * the answer. */ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; } } rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr) continue; kp[c].addr = prl->addr; kp[c].flags = prl->flags; c++; if (kprl.addr != htonl(INADDR_ANY)) break; } rcu_read_unlock(); len = sizeof(*kp) * c; ret = 0; if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen)) ret = -EFAULT; kfree(kp); out: return ret; } static int ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) { struct ip_tunnel_prl_entry *p; int err = 0; if (a->addr == htonl(INADDR_ANY)) return -EINVAL; ASSERT_RTNL(); for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { if (p->addr == a->addr) { if (chg) { p->flags = a->flags; goto out; } err = -EEXIST; goto out; } } if (chg) { err = -ENXIO; goto out; } p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); if (!p) { err = -ENOBUFS; goto out; } p->next = t->prl; p->addr = a->addr; p->flags = a->flags; t->prl_count++; rcu_assign_pointer(t->prl, p); out: return err; } static void prl_list_destroy_rcu(struct rcu_head *head) { struct ip_tunnel_prl_entry *p, *n; p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); do { n = rcu_dereference_protected(p->next, 1); kfree(p); p = n; } while (p); } static int ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) { struct ip_tunnel_prl_entry *x; struct ip_tunnel_prl_entry __rcu **p; int err = 0; ASSERT_RTNL(); if (a && a->addr != htonl(INADDR_ANY)) { for (p = &t->prl; (x = rtnl_dereference(*p)) != NULL; p = &x->next) { if (x->addr == a->addr) { *p = x->next; kfree_rcu(x, rcu_head); t->prl_count--; goto out; } } err = -ENXIO; } else { x = rtnl_dereference(t->prl); if (x) { t->prl_count = 0; call_rcu(&x->rcu_head, prl_list_destroy_rcu); t->prl = NULL; } } out: return err; } static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ip_tunnel_prl __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl prl; int err; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; if (copy_from_user(&prl, data, sizeof(prl))) return -EFAULT; switch (cmd) { case SIOCDELPRL: err = ipip6_tunnel_del_prl(t, &prl); break; case SIOCADDPRL: case SIOCCHGPRL: err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); return err; } static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { struct ip_tunnel_prl_entry *p; int ok = 1; rcu_read_lock(); p = __ipip6_tunnel_locate_prl(t, iph->saddr); if (p) { if (p->flags & PRL_DEFAULT) skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT; else skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; } else { const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; if (ipv6_addr_is_isatap(addr6) && (addr6->s6_addr32[3] == iph->saddr) && ipv6_chk_prefix(addr6, t->dev)) skb->ndisc_nodetype = NDISC_NODETYPE_HOST; else ok = 0; } rcu_read_unlock(); return ok; } static void ipip6_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct sit_net *sitn = net_generic(tunnel->net, sit_net_id); if (dev == sitn->fb_tunnel_dev) { RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL); } else { ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } dst_cache_reset(&tunnel->dst_cache); netdev_put(dev, &tunnel->dev_tracker); } static int ipip6_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; unsigned int data_len = 0; struct ip_tunnel *t; int sifindex; int err; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; } err = -ENOENT; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->daddr, iph->saddr, sifindex); if (!t) goto out; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, t->parms.link, iph->protocol); err = 0; goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, iph->protocol); err = 0; goto out; } err = 0; if (__in6_dev_get(skb->dev) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) goto out; if (t->parms.iph.daddr == 0) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, const struct in6_addr *v6addr) { __be32 v4embed = 0; if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed) return true; return false; } /* Checks if an address matches an address on the tunnel interface. * Used to detect the NAT of proto 41 packets and let them pass spoofing test. * Long story: * This function is called after we considered the packet as spoofed * in is_spoofed_6rd. * We may have a router that is doing NAT for proto 41 packets * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd * function will return true, dropping the packet. * But, we can still check if is spoofed against the IP * addresses associated with the interface. */ static bool only_dnatted(const struct ip_tunnel *tunnel, const struct in6_addr *v6dst) { int prefix_len; #ifdef CONFIG_IPV6_SIT_6RD prefix_len = tunnel->ip6rd.prefixlen + 32 - tunnel->ip6rd.relay_prefixlen; #else prefix_len = 48; #endif return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); } /* Returns true if a packet is spoofed */ static bool packet_is_spoofed(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *tunnel) { const struct ipv6hdr *ipv6h; if (tunnel->dev->priv_flags & IFF_ISATAP) { if (!isatap_chksrc(skb, iph, tunnel)) return true; return false; } if (tunnel->dev->flags & IFF_POINTOPOINT) return false; ipv6h = ipv6_hdr(skb); if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", &iph->saddr, &ipv6h->saddr, &iph->daddr, &ipv6h->daddr); return true; } if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) return false; if (only_dnatted(tunnel, &ipv6h->daddr)) return false; net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", &iph->saddr, &ipv6h->saddr, &iph->daddr, &ipv6h->daddr); return true; } static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; int sifindex; int err; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr, sifindex); if (tunnel) { if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && tunnel->parms.iph.protocol != 0) goto out; skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->dev = tunnel->dev; if (packet_is_spoofed(skb, iph, tunnel)) { DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), !net_eq(tunnel->net, dev_net(tunnel->dev)))) goto out; /* skb can be uncloned in iptunnel_pull_header, so * old iph is no longer valid */ iph = (const struct iphdr *)skb_mac_header(skb); skb_reset_mac_header(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); netif_rx(skb); return 0; } /* no tunnel matched, let upstream know, ipsec may handle it */ return 1; out: kfree_skb(skb); return 0; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; int sifindex; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; iph = ip_hdr(skb); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr, sifindex); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return 1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return sit_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return sit_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. */ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst) { #ifdef CONFIG_IPV6_SIT_6RD if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, tunnel->ip6rd.prefixlen)) { unsigned int pbw0, pbi0; int pbi1; u32 d; pbw0 = tunnel->ip6rd.prefixlen >> 5; pbi0 = tunnel->ip6rd.prefixlen & 0x1f; d = tunnel->ip6rd.relay_prefixlen < 32 ? (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> tunnel->ip6rd.relay_prefixlen : 0; pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; if (pbi1 > 0) d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> (32 - pbi1); *v4dst = tunnel->ip6rd.relay_prefix | htonl(d); return true; } #else if (v6dst->s6_addr16[0] == htons(0x2002)) { /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ memcpy(v4dst, &v6dst->s6_addr16[1], 4); return true; } #endif return false; } static inline __be32 try_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst) { __be32 dst = 0; check_6rd(tunnel, v6dst, &dst); return dst; } /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; const struct in6_addr *addr6; int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); if (tos == 1) tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ if (dev->priv_flags & IFF_ISATAP) { struct neighbour *neigh = NULL; bool do_tx_error = false; if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if ((addr_type & IPV6_ADDR_UNICAST) && ipv6_addr_is_isatap(addr6)) dst = addr6->s6_addr32[3]; else do_tx_error = true; neigh_release(neigh); if (do_tx_error) goto tx_error; } if (!dst) dst = try_6rd(tunnel, &iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; bool do_tx_error = false; if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) != 0) dst = addr6->s6_addr32[3]; else do_tx_error = true; neigh_release(neigh); if (do_tx_error) goto tx_error; } flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_flow(tunnel->net, &fl4, NULL); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { ip_rt_put(rt); goto tx_error; } if (df) { mtu = dst_mtu(&rt->dst) - t_hlen; if (mtu < IPV4_MIN_MTU) { DEV_STATS_INC(dev, collisions); ip_rt_put(rt); goto tx_error; } if (mtu < IPV6_MIN_MTU) { mtu = IPV6_MIN_MTU; df = 0; } if (tunnel->parms.iph.daddr) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; iph6 = ipv6_hdr(skb); } ttl = tiph->ttl; if (ttl == 0) ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) { ip_rt_put(rt); goto tx_error; } skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, struct net_device *dev, u8 ipproto) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); break; case htons(ETH_P_IPV6): ipip6_tunnel_xmit(skb, dev); break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS); break; #endif default: goto tx_err; } return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static void ipip6_tunnel_bind_dev(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); struct net_device *tdev = NULL; int hlen = LL_MAX_HEADER; const struct iphdr *iph; struct flowi4 fl4; iph = &tunnel->parms.iph; if (iph->daddr) { struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4, NULL, iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, iph->tos & INET_DSCP_MASK, tunnel->parms.link); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev && !netif_is_l3_master(tdev)) { int mtu; mtu = tdev->mtu - t_hlen; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); hlen = tdev->hard_header_len + tdev->needed_headroom; } dev->needed_headroom = t_hlen + hlen; } static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm_kern *p, __u32 fwmark) { struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); ipip6_tunnel_unlink(sitn, t); synchronize_net(); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; __dev_addr_set(t->dev, &p->iph.saddr, 4); memcpy(t->dev->broadcast, &p->iph.daddr, 4); ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; t->fwmark = fwmark; ipip6_tunnel_bind_dev(t->dev); } dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } #ifdef CONFIG_IPV6_SIT_6RD static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, struct ip_tunnel_6rd *ip6rd) { struct in6_addr prefix; __be32 relay_prefix; if (ip6rd->relay_prefixlen > 32 || ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64) return -EINVAL; ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen); if (!ipv6_addr_equal(&prefix, &ip6rd->prefix)) return -EINVAL; if (ip6rd->relay_prefixlen) relay_prefix = ip6rd->relay_prefix & htonl(0xffffffffUL << (32 - ip6rd->relay_prefixlen)); else relay_prefix = 0; if (relay_prefix != ip6rd->relay_prefix) return -EINVAL; t->ip6rd.prefix = prefix; t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm_kern p; struct ip_tunnel_6rd ip6rd; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (!ip_tunnel_parm_from_user(&p, data)) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } if (!t) t = netdev_priv(dev); ip6rd.prefix = t->ip6rd.prefix; ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) return -EFAULT; return 0; } static int ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; int err; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) return -EFAULT; if (cmd != SIOCDEL6RD) { err = ipip6_tunnel_update_6rd(t, &ip6rd); if (err < 0) return err; } else ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); return 0; } #endif /* CONFIG_IPV6_SIT_6RD */ static bool ipip6_valid_ip_proto(u8 ipproto) { return ipproto == IPPROTO_IPV6 || ipproto == IPPROTO_IPIP || #if IS_ENABLED(CONFIG_MPLS) ipproto == IPPROTO_MPLS || #endif ipproto == 0; } static int __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!ipip6_valid_ip_proto(p->iph.protocol)) return -EINVAL; if (p->iph.version != 4 || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); return 0; } static int ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) t = ipip6_tunnel_locate(t->net, p, 0); if (!t) t = netdev_priv(dev); memcpy(p, &t->parms, sizeof(*p)); return 0; } static int ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; t = ipip6_tunnel_locate(t->net, p, 1); if (!t) return -ENOBUFS; return 0; } static int ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); int err; err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; t = ipip6_tunnel_locate(t->net, p, 0); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (!t) return -ENOENT; } else { if (t) { if (t->dev != dev) return -EEXIST; } else { if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) return -EINVAL; t = netdev_priv(dev); } ipip6_tunnel_update(t, p, t->fwmark); } return 0; } static int ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p) { struct ip_tunnel *t = netdev_priv(dev); if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { t = ipip6_tunnel_locate(t->net, p, 0); if (!t) return -ENOENT; if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) return -EPERM; dev = t->dev; } unregister_netdevice(dev); return 0; } static int ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) { switch (cmd) { case SIOCGETTUNNEL: return ipip6_tunnel_get(dev, p); case SIOCADDTUNNEL: return ipip6_tunnel_add(dev, p); case SIOCCHGTUNNEL: return ipip6_tunnel_change(dev, p); case SIOCDELTUNNEL: return ipip6_tunnel_del(dev, p); default: return -EINVAL; } } static int ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { switch (cmd) { case SIOCGETTUNNEL: case SIOCADDTUNNEL: case SIOCCHGTUNNEL: case SIOCDELTUNNEL: return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); case SIOCGETPRL: return ipip6_tunnel_get_prl(dev, data); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: return ipip6_tunnel_prl_ctl(dev, data, cmd); #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: return ipip6_tunnel_get6rd(dev, data); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: return ipip6_tunnel_6rdctl(dev, data, cmd); #endif default: return -EINVAL; } } static const struct net_device_ops ipip6_netdev_ops = { .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); dst_cache_destroy(&tunnel->dst_cache); } #define SIT_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip6_tunnel_setup(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->netdev_ops = &ipip6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; dev->lltx = true; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; } static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int err; tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) return err; netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; } static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); iph->version = 4; iph->protocol = IPPROTO_IPV6; iph->ihl = 5; iph->ttl = 64; rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); } static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (!ipip6_valid_ip_proto(proto)) return -EINVAL; return 0; } static void ipip6_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPV6; parms->iph.ihl = 5; parms->iph.ttl = 64; if (!data) return; ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } #ifdef CONFIG_IPV6_SIT_6RD /* This function returns true when 6RD attributes are present in the nl msg */ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], struct ip_tunnel_6rd *ip6rd) { bool ret = false; memset(ip6rd, 0, sizeof(*ip6rd)); if (!data) return ret; if (data[IFLA_IPTUN_6RD_PREFIX]) { ret = true; ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { ret = true; ip6rd->relay_prefix = nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]); } if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { ret = true; ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) { ret = true; ip6rd->relay_prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); } return ret; } #endif static int ipip6_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif int err; nt = netdev_priv(dev); if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); if (err < 0) return err; } ipip6_netlink_parms(data, &nt->parms, &nt->fwmark); if (ipip6_tunnel_locate(net, &nt->parms, 0)) return -EEXIST; err = ipip6_tunnel_create(dev); if (err < 0) return err; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu >= IPV6_MIN_MTU && mtu <= IP6_MAX_MTU - dev->hard_header_len) dev->mtu = mtu; } #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, &ip6rd); if (err < 0) unregister_netdevice_queue(dev, NULL); } #endif return err; } static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif __u32 fwmark = t->fwmark; int err; if (dev == sitn->fb_tunnel_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip6_netlink_parms(data, &p, &fwmark); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; t = ipip6_tunnel_locate(net, &p, 0); if (t) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ipip6_tunnel_update(t, &p, fwmark); #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) return ipip6_tunnel_update_6rd(t, &ip6rd); #endif return 0; } static size_t ipip6_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + #ifdef CONFIG_IPV6_SIT_6RD /* IFLA_IPTUN_6RD_PREFIX */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_6RD_RELAY_PREFIX */ nla_total_size(4) + /* IFLA_IPTUN_6RD_PREFIXLEN */ nla_total_size(2) + /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ nla_total_size(2) + #endif /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm_kern *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_be16(skb, IFLA_IPTUN_FLAGS, ip_tunnel_flags_to_be16(parm->i_flags)) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; #ifdef CONFIG_IPV6_SIT_6RD if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX, &tunnel->ip6rd.prefix) || nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, tunnel->ip6rd.relay_prefix) || nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, tunnel->ip6rd.prefixlen) || nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, tunnel->ip6rd.relay_prefixlen)) goto nla_put_failure; #endif if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type = NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, #ifdef CONFIG_IPV6_SIT_6RD [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 }, [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, #endif [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static void ipip6_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); if (dev != sitn->fb_tunnel_dev) unregister_netdevice_queue(dev, head); } static struct rtnl_link_ops sit_link_ops __read_mostly = { .kind = "sit", .maxtype = IFLA_IPTUN_MAX, .policy = ipip6_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip6_tunnel_setup, .validate = ipip6_validate, .newlink = ipip6_newlink, .changelink = ipip6_changelink, .get_size = ipip6_get_size, .fill_info = ipip6_fill_info, .dellink = ipip6_dellink, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip6_err, .priority = 2, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip6_err, .priority = 2, }; #endif static void __net_exit sit_destroy_tunnels(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; int prio; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &sit_link_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); t = rtnl_dereference(t->next); } } } } static int __net_init sit_init_net(struct net *net) { struct sit_net *sitn = net_generic(net, sit_net_id); struct ip_tunnel *t; int err; sitn->tunnels[0] = sitn->tunnels_wc; sitn->tunnels[1] = sitn->tunnels_l; sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; if (!net_has_fallback_tunnels(net)) return 0; sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; } dev_net_set(sitn->fb_tunnel_dev, net); sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ sitn->fb_tunnel_dev->netns_local = true; err = register_netdev(sitn->fb_tunnel_dev); if (err) goto err_reg_dev; ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); t = netdev_priv(sitn->fb_tunnel_dev); strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: free_netdev(sitn->fb_tunnel_dev); err_alloc_dev: return err; } static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) sit_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, .exit_batch_rtnl = sit_exit_batch_rtnl, .id = &sit_net_id, .size = sizeof(struct sit_net), }; static void __exit sit_cleanup(void) { rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm4_tunnel_deregister(&ipip_handler, AF_INET); #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); #endif unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } static int __init sit_init(void) { int err; pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&sit_handler, AF_INET6); if (err < 0) { pr_info("%s: can't register ip6ip4\n", __func__); goto xfrm_tunnel_failed; } err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register ip4ip4\n", __func__); goto xfrm_tunnel4_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register mplsip\n", __func__); goto xfrm_tunnel_mpls_failed; } #endif err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mpls_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm_tunnel_failed: unregister_pernet_device(&sit_net_ops); goto out; } module_init(sit_init); module_exit(sit_cleanup); MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0");
281 26 29 29 191 1 41 59 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kernfs.h - pseudo filesystem decoupled from vfs locking */ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H #include <linux/err.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/lockdep.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/types.h> #include <linux/uidgid.h> #include <linux/wait.h> #include <linux/rwsem.h> #include <linux/cache.h> struct file; struct dentry; struct iattr; struct seq_file; struct vm_area_struct; struct vm_operations_struct; struct super_block; struct file_system_type; struct poll_table_struct; struct fs_context; struct kernfs_fs_context; struct kernfs_open_node; struct kernfs_iattrs; /* * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash * table of locks. * Having a small hash table would impact scalability, since * more and more kernfs_node objects will end up using same lock * and having a very large hash table would waste memory. * * At the moment size of hash table of locks is being set based on * the number of CPUs as follows: * * NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS * 1 1 2 * 2-3 2 4 * 4-7 4 16 * 8-15 6 64 * 16-31 8 256 * 32 and more 10 1024 * * The above relation between NR_CPU and number of locks is based * on some internal experimentation which involved booting qemu * with different values of smp, performing some sysfs operations * on all CPUs and observing how increase in number of locks impacts * completion time of these sysfs operations on each CPU. */ #ifdef CONFIG_SMP #define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32))) #else #define NR_KERNFS_LOCK_BITS 1 #endif #define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS) /* * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * * filp->private_data points to seq_file whose ->private points to * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is * protected by kernfs_global_locks.open_file_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { struct mutex open_file_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { KERNFS_DIR = 0x0001, KERNFS_FILE = 0x0002, KERNFS_LINK = 0x0004, }; #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK #define KERNFS_MAX_USER_XATTRS 128 #define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ enum kernfs_root_flag { /* * kernfs_nodes are created in the deactivated state and invisible. * They require explicit kernfs_activate() to become visible. This * can be used to make related nodes become visible atomically * after all nodes are created successfully. */ KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, /* * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2) * succeeds regardless of the RW permissions. sysfs had an extra * layer of enforcement where open(2) fails with -EACCES regardless * of CAP_DAC_OVERRIDE if the permission doesn't have the * respective read or write access at all (none of S_IRUGO or * S_IWUGO) or the respective operation isn't implemented. The * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, /* * The filesystem supports exportfs operation, so userspace can use * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, /* * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. This fits * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; /* * Monotonic revision counter, used to identify if a directory * node has changed during negative dentry revalidation. */ unsigned long rev; }; struct kernfs_elem_symlink { struct kernfs_node *target_kn; }; struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node __rcu *open; loff_t size; struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as count reference is held, the kernfs_node itself is * accessible. Dereferencing elem or any other outer entity requires * active reference. */ struct kernfs_node { atomic_t count; atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* * Use kernfs_get_parent() and kernfs_name/path() instead of * accessing the following two fields directly. If the node is * never moved to a different parent, it is safe to access the * parent directly. */ struct kernfs_node *parent; const char *name; struct rb_node rb; const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ unsigned short flags; umode_t mode; union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; void *priv; struct kernfs_iattrs *iattr; struct rcu_head rcu; }; /* * kernfs_syscall_ops may be specified on kernfs_create_root() to support * syscalls. These optional callbacks are invoked on the matching syscalls * and can perform any kernfs operations which don't necessarily have to be * the exact operation requested. An active reference is held for each * kernfs_node parameter. */ struct kernfs_syscall_ops { int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); }; struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root); struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; size_t atomic_write_len; bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { /* * Optional open/release methods. Both are called with * @of->seq_file populated. */ int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * Read is handled by either seq_file or raw_read(). * * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. */ int (*seq_show)(struct seq_file *sf, void *v); void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer. If atomic_write_len * is not set, a write larger than PAGE_SIZE results in partial * operations of PAGE_SIZE chunks. If atomic_write_len is set, * writes upto the specified size are executed atomically but * larger ones are rejected with -E2BIG. */ size_t atomic_write_len; /* * "prealloc" causes a buffer to be allocated at open for * all read/write requests. As ->seq_show uses seq_read() * which does its own allocation, it is incompatible with * ->prealloc. Provide ->read and ->write with ->prealloc. */ bool prealloc; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence); }; /* * The kernfs superblock creation/mount parameter context. */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ void *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ bool new_sb_created; /* Set to T if we allocated a new sb */ }; #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return kn->flags & KERNFS_TYPE_MASK; } static inline ino_t kernfs_id_ino(u64 id) { /* id is ino if ino_t is 64bit; otherwise, low 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return id; else return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return 1; else return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) { return kernfs_id_ino(kn->id); } static inline ino_t kernfs_gen(struct kernfs_node *kn) { return kernfs_id_gen(kn->id); } /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty * * This is to be called right after @kn is created to enable namespace * under it. All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); kn->flags |= KERNFS_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. */ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return kn->flags & KERNFS_NS; } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size); int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); const void *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { return NULL; } static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { return NULL; } static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } static inline struct inode * kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { return NULL; } static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } static inline void kernfs_activate(struct kernfs_node *kn) { } static inline void kernfs_remove(struct kernfs_node *kn) { } static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt) { return -ENOSYS; } static inline void kernfs_notify(struct kernfs_node *kn) { } static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { return -ENOSYS; } static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) { return -ENOSYS; } static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * If @kn is NULL result will be "(null)". * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return kernfs_path_from_node(kn, NULL, buf, buflen); } static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { return kernfs_find_and_get_ns(kn, name, NULL); } static inline struct kernfs_node * kernfs_walk_and_get(struct kernfs_node *kn, const char *path) { return kernfs_walk_and_get_ns(kn, path, NULL); } static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); } static inline int kernfs_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name) { return kernfs_rename_ns(kn, new_parent, new_name, NULL); } #endif /* __LINUX_KERNFS_H */
7 7 7 7 7 7 7 7 197 197 197 197 197 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 MLME SAP interface * * Copyright (c) 2009, Jouni Malinen <j@w1.fi> * Copyright (c) 2015 Intel Deutschland GmbH * Copyright (C) 2019-2020, 2022-2024 Intel Corporation */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/slab.h> #include <linux/wireless.h> #include <net/cfg80211.h> #include <net/iw_handler.h> #include "core.h" #include "nl80211.h" #include "rdev-ops.h" void cfg80211_rx_assoc_resp(struct net_device *dev, const struct cfg80211_rx_assoc_resp_data *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf; struct cfg80211_connect_resp_params cr = { .timeout_reason = NL80211_TIMEOUT_UNSPECIFIED, .req_ie = data->req_ies, .req_ie_len = data->req_ies_len, .resp_ie = mgmt->u.assoc_resp.variable, .resp_ie_len = data->len - offsetof(struct ieee80211_mgmt, u.assoc_resp.variable), .status = le16_to_cpu(mgmt->u.assoc_resp.status_code), .ap_mld_addr = data->ap_mld_addr, }; unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { cr.links[link_id].status = data->links[link_id].status; cr.links[link_id].bss = data->links[link_id].bss; WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS && (!cr.ap_mld_addr || !cr.links[link_id].bss)); if (!cr.links[link_id].bss) continue; cr.links[link_id].bssid = data->links[link_id].bss->bssid; cr.links[link_id].addr = data->links[link_id].addr; /* need to have local link addresses for MLO connections */ WARN_ON(cr.ap_mld_addr && !is_valid_ether_addr(cr.links[link_id].addr)); BUG_ON(!cr.links[link_id].bss->channel); if (cr.links[link_id].bss->channel->band == NL80211_BAND_S1GHZ) { WARN_ON(link_id); cr.resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable; cr.resp_ie_len = data->len - offsetof(struct ieee80211_mgmt, u.s1g_assoc_resp.variable); } if (cr.ap_mld_addr) cr.valid_links |= BIT(link_id); } trace_cfg80211_send_rx_assoc(dev, data); /* * This is a bit of a hack, we don't notify userspace of * a (re-)association reply if we tried to send a reassoc * and got a reject -- we only try again with an assoc * frame instead of reassoc. */ if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) { for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { struct cfg80211_bss *bss = data->links[link_id].bss; if (!bss) continue; cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } return; } nl80211_send_rx_assoc(rdev, dev, data); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); static void cfg80211_process_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); nl80211_send_rx_auth(rdev, wdev->netdev, buf, len, GFP_KERNEL); cfg80211_sme_rx_auth(wdev, buf, len); } static void cfg80211_process_deauth(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid)) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_deauth(wdev); } static void cfg80211_process_disassoc(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (WARN_ON(!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_disassoc(wdev); } void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; lockdep_assert_wiphy(wdev->wiphy); trace_cfg80211_rx_mlme_mgmt(dev, buf, len); if (WARN_ON(len < 2)) return; if (ieee80211_is_auth(mgmt->frame_control)) cfg80211_process_auth(wdev, buf, len); else if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, false); else if (ieee80211_is_disassoc(mgmt->frame_control)) cfg80211_process_disassoc(wdev, buf, len, false); } EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_auth_timeout(dev, addr); nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_auth_timeout(wdev); } EXPORT_SYMBOL(cfg80211_auth_timeout); void cfg80211_assoc_failure(struct net_device *dev, struct cfg80211_assoc_failure *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const u8 *addr = data->ap_mld_addr ?: data->bss[0]->bssid; int i; trace_cfg80211_send_assoc_failure(dev, data); if (data->timeout) { nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_assoc_timeout(wdev); } else { cfg80211_sme_abandon_assoc(wdev); } for (i = 0; i < ARRAY_SIZE(data->bss); i++) { struct cfg80211_bss *bss = data->bss[i]; if (!bss) continue; cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } } EXPORT_SYMBOL(cfg80211_assoc_failure); void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, bool reconnect) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; lockdep_assert_wiphy(wdev->wiphy); trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect); if (WARN_ON(len < 2)) return; if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, reconnect); else cfg80211_process_disassoc(wdev, buf, len, reconnect); } EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt); void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; char *buf = kmalloc(128, gfp); if (buf) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = sprintf(buf, "MLME-MICHAELMICFAILURE." "indication(keyid=%d %scast addr=%pM)", key_id, key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni", addr); wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); kfree(buf); } #endif trace_cfg80211_michael_mic_failure(dev, addr, key_type, key_id, tsc); nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp); } EXPORT_SYMBOL(cfg80211_michael_mic_failure); /* some MLME handling for userspace SME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; lockdep_assert_wiphy(wdev->wiphy); if (!req->bss) return -ENOENT; if (req->link_id >= 0 && !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) return -EINVAL; if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) { if (!req->key || !req->key_len || req->key_idx < 0 || req->key_idx > 3) return -EINVAL; } if (wdev->connected && ether_addr_equal(req->bss->bssid, wdev->u.client.connected_addr)) return -EALREADY; if (ether_addr_equal(req->bss->bssid, dev->dev_addr) || (req->link_id >= 0 && ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) return -EINVAL; return rdev_auth(rdev, dev, req); } /* Do a logical ht_capa &= ht_capa_mask. */ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask) { int i; u8 *p1, *p2; if (!ht_capa_mask) { memset(ht_capa, 0, sizeof(*ht_capa)); return; } p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); for (i = 0; i < sizeof(*ht_capa); i++) p1[i] &= p2[i]; } /* Do a logical vht_capa &= vht_capa_mask. */ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask) { int i; u8 *p1, *p2; if (!vht_capa_mask) { memset(vht_capa, 0, sizeof(*vht_capa)); return; } p1 = (u8*)(vht_capa); p2 = (u8*)(vht_capa_mask); for (i = 0; i < sizeof(*vht_capa); i++) p1[i] &= p2[i]; } static int cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a, const struct ieee80211_multi_link_elem *mle_b, struct netlink_ext_ack *extack) { const struct ieee80211_mle_basic_common_info *common_a, *common_b; common_a = (const void *)mle_a->variable; common_b = (const void *)mle_b->variable; if (memcmp(common_a->mld_mac_addr, common_b->mld_mac_addr, ETH_ALEN)) { NL_SET_ERR_MSG(extack, "AP MLD address mismatch"); return -EINVAL; } if (ieee80211_mle_get_eml_cap((const u8 *)mle_a) != ieee80211_mle_get_eml_cap((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link EML capabilities mismatch"); return -EINVAL; } if (ieee80211_mle_get_mld_capa_op((const u8 *)mle_a) != ieee80211_mle_get_mld_capa_op((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link MLD capabilities/ops mismatch"); return -EINVAL; } return 0; } static int cfg80211_mlme_check_mlo(struct net_device *dev, struct cfg80211_assoc_request *req, struct netlink_ext_ack *extack) { const struct ieee80211_multi_link_elem *mles[ARRAY_SIZE(req->links)] = {}; int i; if (req->link_id < 0) return 0; if (!req->links[req->link_id].bss) { NL_SET_ERR_MSG(extack, "no BSS for assoc link"); return -EINVAL; } rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(req->links); i++) { const struct cfg80211_bss_ies *ies; const struct element *ml; if (!req->links[i].bss) continue; if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) { NL_SET_ERR_MSG(extack, "BSSID must not be our address"); req->links[i].error = -EINVAL; goto error; } ies = rcu_dereference(req->links[i].bss->ies); ml = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, ies->data, ies->len); if (!ml) { NL_SET_ERR_MSG(extack, "MLO BSS w/o ML element"); req->links[i].error = -EINVAL; goto error; } if (!ieee80211_mle_type_ok(ml->data + 1, IEEE80211_ML_CONTROL_TYPE_BASIC, ml->datalen - 1)) { NL_SET_ERR_MSG(extack, "BSS with invalid ML element"); req->links[i].error = -EINVAL; goto error; } mles[i] = (const void *)(ml->data + 1); if (ieee80211_mle_get_link_id((const u8 *)mles[i]) != i) { NL_SET_ERR_MSG(extack, "link ID mismatch"); req->links[i].error = -EINVAL; goto error; } } if (WARN_ON(!mles[req->link_id])) goto error; for (i = 0; i < ARRAY_SIZE(req->links); i++) { if (i == req->link_id || !req->links[i].bss) continue; if (WARN_ON(!mles[i])) goto error; if (cfg80211_mlme_check_mlo_compat(mles[req->link_id], mles[i], extack)) { req->links[i].error = -EINVAL; goto error; } } rcu_read_unlock(); return 0; error: rcu_read_unlock(); return -EINVAL; } /* Note: caller must cfg80211_put_bss() regardless of result */ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req, struct netlink_ext_ack *extack) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); err = cfg80211_mlme_check_mlo(dev, req, extack); if (err) return err; if (wdev->connected && (!req->prev_bssid || !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) return -EALREADY; if ((req->bss && ether_addr_equal(req->bss->bssid, dev->dev_addr)) || (req->link_id >= 0 && ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) return -EINVAL; cfg80211_oper_and_ht_capa(&req->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&req->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); err = rdev_assoc(rdev, dev, req); if (!err) { int link_id; if (req->bss) { cfg80211_ref_bss(&rdev->wiphy, req->bss); cfg80211_hold_bss(bss_from_pub(req->bss)); } for (link_id = 0; link_id < ARRAY_SIZE(req->links); link_id++) { if (!req->links[link_id].bss) continue; cfg80211_ref_bss(&rdev->wiphy, req->links[link_id].bss); cfg80211_hold_bss(bss_from_pub(req->links[link_id].bss)); } } return err; } int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_deauth_request req = { .bssid = bssid, .reason_code = reason, .ie = ie, .ie_len = ie_len, .local_state_change = local_state_change, }; lockdep_assert_wiphy(wdev->wiphy); if (local_state_change && (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return 0; if (ether_addr_equal(wdev->disconnect_bssid, bssid) || (wdev->connected && ether_addr_equal(wdev->u.client.connected_addr, bssid))) wdev->conn_owner_nlportid = 0; return rdev_deauth(rdev, dev, &req); } int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_disassoc_request req = { .reason_code = reason, .local_state_change = local_state_change, .ie = ie, .ie_len = ie_len, .ap_addr = ap_addr, }; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->connected) return -ENOTCONN; if (memcmp(wdev->u.client.connected_addr, ap_addr, ETH_ALEN)) return -ENOTCONN; err = rdev_disassoc(rdev, dev, &req); if (err) return err; /* driver should have reported the disassoc */ WARN_ON(wdev->connected); return 0; } void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; u8 bssid[ETH_ALEN]; lockdep_assert_wiphy(wdev->wiphy); if (!rdev->ops->deauth) return; if (!wdev->connected) return; memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN); cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); } struct cfg80211_mgmt_registration { struct list_head list; struct wireless_dev *wdev; u32 nlportid; int match_len; __le16 frame_type; bool multicast_rx; u8 match[]; }; static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; struct mgmt_frame_regs upd = {}; lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } rcu_read_lock(); list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); u32 mcast_mask = 0; if (reg->multicast_rx) mcast_mask = mask; upd.global_stypes |= mask; upd.global_mcast_stypes |= mcast_mask; if (tmp == wdev) { upd.interface_stypes |= mask; upd.interface_mcast_stypes |= mcast_mask; } } } rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) { NL_SET_ERR_MSG(extack, "frame type not management"); return -EINVAL; } if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) { NL_SET_ERR_MSG(extack, "Invalid frame type"); return -EINVAL; } mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) { NL_SET_ERR_MSG(extack, "Registration to specific type not supported"); return -EINVAL; } /* * To support Pre Association Security Negotiation (PASN), registration * for authentication frames should be supported. However, as some * versions of the user space daemons wrongly register to all types of * authentication frames (which might result in unexpected behavior) * allow such registration if the request is for a specific * authentication algorithm number. */ if (wdev->iftype == NL80211_IFTYPE_STATION && (frame_type & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_AUTH && !(match_data && match_len >= 2)) { NL_SET_ERR_MSG(extack, "Authentication algorithm number required"); return -EINVAL; } nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); if (!nreg) return -ENOMEM; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); if (frame_type != le16_to_cpu(reg->frame_type)) continue; if (memcmp(reg->match, match_data, mlen) == 0) { if (reg->multicast_rx != multicast_rx) { update_multicast = true; reg->multicast_rx = multicast_rx; break; } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; } } if (err) goto out; if (update_multicast) { kfree(nreg); } else { memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; nreg->nlportid = snd_portid; nreg->frame_type = cpu_to_le16(frame_type); nreg->wdev = wdev; nreg->multicast_rx = multicast_rx; list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); return 0; out: kfree(nreg); spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) continue; list_del(&reg->list); kfree(reg); wdev->mgmt_registrations_need_update = 1; schedule_work(&rdev->mgmt_registrations_update_wk); } spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } if (nlportid == wdev->ap_unexpected_nlportid) wdev->ap_unexpected_nlportid = 0; } void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(&reg->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr) { int i; for_each_valid_link(wdev, i) { if (ether_addr_equal(addr, wdev->links[i].addr)) return true; } return ether_addr_equal(addr, wdev_address(wdev)); } static bool cfg80211_allowed_random_address(struct wireless_dev *wdev, const struct ieee80211_mgmt *mgmt) { if (ieee80211_is_auth(mgmt->frame_control) || ieee80211_is_deauth(mgmt->frame_control)) { /* Allow random TA to be used with authentication and * deauthentication frames if the driver has indicated support. */ if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA)) return true; } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { /* Allow random TA to be used with Public Action frames if the * driver has indicated support. */ if (!wdev->connected && wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) return true; if (wdev->connected && wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) return true; } return false; } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { const struct ieee80211_mgmt *mgmt; u16 stype; lockdep_assert_wiphy(&rdev->wiphy); if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; if (params->len < 24 + 1) return -EINVAL; mgmt = (const struct ieee80211_mgmt *)params->buf; if (!ieee80211_is_mgmt(mgmt->frame_control)) return -EINVAL; stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) return -EINVAL; if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { int err = 0; switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: /* * check for IBSS DA must be done by driver as * cfg80211 doesn't track the stations */ if (!wdev->u.ibss.current_bss || !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid, mgmt->bssid)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!wdev->connected) { err = -ENOTCONN; break; } /* FIXME: MLD may address this differently */ if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->bssid)) { err = -ENOTCONN; break; } /* for station, check that DA is the AP */ if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->da)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP_VLAN: if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev)) && (params->link_id < 0 || !ether_addr_equal(mgmt->bssid, wdev->links[params->link_id].addr))) err = -EINVAL; break; case NL80211_IFTYPE_MESH_POINT: if (!ether_addr_equal(mgmt->sa, mgmt->bssid)) { err = -EINVAL; break; } /* * check for mesh DA must be done by driver as * cfg80211 doesn't track the stations */ break; case NL80211_IFTYPE_P2P_DEVICE: /* * fall through, P2P device only supports * public action frames */ case NL80211_IFTYPE_NAN: default: err = -EOPNOTSUPP; break; } if (err) return err; } if (!cfg80211_allowed_address(wdev, mgmt->sa) && !cfg80211_allowed_random_address(wdev, mgmt)) return -EINVAL; /* Transmit the management frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); } bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev, struct cfg80211_rx_info *info) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg; const struct ieee80211_txrx_stypes *stypes = &wiphy->mgmt_stypes[wdev->iftype]; struct ieee80211_mgmt *mgmt = (void *)info->buf; const u8 *data; int data_len; bool result = false; __le16 ftype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); u16 stype; trace_cfg80211_rx_mgmt(wdev, info); stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; if (!(stypes->rx & BIT(stype))) { trace_cfg80211_return_bool(false); return false; } data = info->buf + ieee80211_hdrlen(mgmt->frame_control); data_len = info->len - ieee80211_hdrlen(mgmt->frame_control); spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) continue; if (reg->match_len > data_len) continue; if (memcmp(reg->match, data, reg->match_len)) continue; /* found match! */ /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, info, GFP_ATOMIC)) continue; result = true; break; } spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; } EXPORT_SYMBOL(cfg80211_rx_mgmt_ext); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { cancel_delayed_work(&rdev->dfs_update_channels_wk); queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, 0); } void cfg80211_dfs_channels_update_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; struct cfg80211_chan_def chandef; struct ieee80211_supported_band *sband; struct ieee80211_channel *c; struct wiphy *wiphy; bool check_again = false; unsigned long timeout, next_time = 0; unsigned long time_dfs_update; enum nl80211_radar_event radar_event; int bandid, i; rdev = container_of(delayed_work, struct cfg80211_registered_device, dfs_update_channels_wk); wiphy = &rdev->wiphy; rtnl_lock(); for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) { sband = wiphy->bands[bandid]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { c = &sband->channels[i]; if (!(c->flags & IEEE80211_CHAN_RADAR)) continue; if (c->dfs_state != NL80211_DFS_UNAVAILABLE && c->dfs_state != NL80211_DFS_AVAILABLE) continue; if (c->dfs_state == NL80211_DFS_UNAVAILABLE) { time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS; radar_event = NL80211_RADAR_NOP_FINISHED; } else { if (regulatory_pre_cac_allowed(wiphy) || cfg80211_any_wiphy_oper_chan(wiphy, c)) continue; time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS; radar_event = NL80211_RADAR_PRE_CAC_EXPIRED; } timeout = c->dfs_state_entered + msecs_to_jiffies(time_dfs_update); if (time_after_eq(jiffies, timeout)) { c->dfs_state = NL80211_DFS_USABLE; c->dfs_state_entered = jiffies; cfg80211_chandef_create(&chandef, c, NL80211_CHAN_NO_HT); nl80211_radar_notify(rdev, &chandef, radar_event, NULL, GFP_ATOMIC); regulatory_propagate_dfs_state(wiphy, &chandef, c->dfs_state, radar_event); continue; } if (!check_again) next_time = timeout - jiffies; else next_time = min(next_time, timeout - jiffies); check_again = true; } } rtnl_unlock(); /* reschedule if there are other channels waiting to be cleared again */ if (check_again) queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, next_time); } void __cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, bool offchan, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_radar_event(wiphy, chandef, offchan); /* only set the chandef supplied channel to unavailable, in * case the radar is detected on only one of multiple channels * spanned by the chandef. */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); if (offchan) queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); memcpy(&rdev->radar_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); } EXPORT_SYMBOL(__cfg80211_radar_event); void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event, gfp_t gfp, unsigned int link_id) { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; if (WARN_ON(wdev->valid_links && !(wdev->valid_links & BIT(link_id)))) return; trace_cfg80211_cac_event(netdev, event, link_id); if (WARN_ON(!wdev->links[link_id].cac_started && event != NL80211_RADAR_CAC_STARTED)) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: timeout = wdev->links[link_id].cac_start_time + msecs_to_jiffies(wdev->links[link_id].cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); fallthrough; case NL80211_RADAR_CAC_ABORTED: wdev->links[link_id].cac_started = false; break; case NL80211_RADAR_CAC_STARTED: wdev->links[link_id].cac_started = true; break; default: WARN_ON(1); return; } nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } EXPORT_SYMBOL(cfg80211_cac_event); static void __cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { struct wiphy *wiphy = &rdev->wiphy; struct net_device *netdev; lockdep_assert_wiphy(&rdev->wiphy); if (!cfg80211_chandef_valid(chandef)) return; if (!rdev->background_radar_wdev) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(*chandef)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); wdev = rdev->background_radar_wdev; break; case NL80211_RADAR_CAC_ABORTED: if (!cancel_delayed_work(&rdev->background_cac_done_wk)) return; wdev = rdev->background_radar_wdev; break; case NL80211_RADAR_CAC_STARTED: break; default: return; } netdev = wdev ? wdev->netdev : NULL; nl80211_radar_notify(rdev, chandef, event, netdev, GFP_KERNEL); } static void cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { wiphy_lock(&rdev->wiphy); __cfg80211_background_cac_event(rdev, rdev->background_radar_wdev, chandef, event); wiphy_unlock(&rdev->wiphy); } void cfg80211_background_cac_done_wk(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; rdev = container_of(delayed_work, struct cfg80211_registered_device, background_cac_done_wk); cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_FINISHED); } void cfg80211_background_cac_abort_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, background_cac_abort_wk); cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_ABORTED); } void cfg80211_background_cac_abort(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); } EXPORT_SYMBOL(cfg80211_background_cac_abort); int cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) { unsigned int cac_time_ms; int err; lockdep_assert_wiphy(&rdev->wiphy); if (!wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_RADAR_BACKGROUND)) return -EOPNOTSUPP; /* Offchannel chain already locked by another wdev */ if (rdev->background_radar_wdev && rdev->background_radar_wdev != wdev) return -EBUSY; /* CAC already in progress on the offchannel chain */ if (rdev->background_radar_wdev == wdev && delayed_work_pending(&rdev->background_cac_done_wk)) return -EBUSY; err = rdev_set_radar_background(rdev, chandef); if (err) return err; cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, chandef); if (!cac_time_ms) cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; rdev->background_radar_chandef = *chandef; rdev->background_radar_wdev = wdev; /* Get offchain ownership */ __cfg80211_background_cac_event(rdev, wdev, chandef, NL80211_RADAR_CAC_STARTED); queue_delayed_work(cfg80211_wq, &rdev->background_cac_done_wk, msecs_to_jiffies(cac_time_ms)); return 0; } void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); lockdep_assert_wiphy(wiphy); if (wdev != rdev->background_radar_wdev) return; rdev_set_radar_background(rdev, NULL); rdev->background_radar_wdev = NULL; /* Release offchain ownership */ __cfg80211_background_cac_event(rdev, wdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_ABORTED); }
17 18 18 18 18 18 3 1 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 10 8 18 8 19 10 10 1 1 10 8 19 19 19 19 19 19 19 19 19 3 19 2 1 1 1 2 1 1 13 13 1 1 1 3 3 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/rculist.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_stp.h" /* since time values in bpdu are in jiffies and then scaled (1/256) * before sending, make sure that is at least one STP tick. */ #define MESSAGE_AGE_INCR ((HZ / 256) + 1) static const char *const br_port_state_names[] = { [BR_STATE_DISABLED] = "disabled", [BR_STATE_LISTENING] = "listening", [BR_STATE_LEARNING] = "learning", [BR_STATE_FORWARDING] = "forwarding", [BR_STATE_BLOCKING] = "blocking", }; void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, .flags = SWITCHDEV_F_DEFER, .u.stp_state = state, }; int err; /* Don't change the state of the ports if they are driven by a different * protocol. */ if (p->flags & BR_MRP_AWARE) return; p->state = state; if (br_opt_get(p->br, BROPT_MST_ENABLED)) { err = br_mst_set_state(p, 0, state, NULL); if (err) br_warn(p->br, "error setting MST state on port %u(%s)\n", p->port_no, netdev_name(p->dev)); } err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); else br_info(p->br, "port %u(%s) entered %s state\n", (unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]); if (p->br->stp_enabled == BR_KERNEL_STP) { switch (p->state) { case BR_STATE_BLOCKING: p->stp_xstats.transition_blk++; break; case BR_STATE_FORWARDING: p->stp_xstats.transition_fwd++; break; } } } u8 br_port_get_stp_state(const struct net_device *dev) { struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_rtnl(dev); if (!p) return BR_STATE_DISABLED; return p->state; } EXPORT_SYMBOL_GPL(br_port_get_stp_state); /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { struct net_bridge_port *p; list_for_each_entry_rcu(p, &br->port_list, list, lockdep_is_held(&br->lock)) { if (p->port_no == port_no) return p; } return NULL; } /* called under bridge lock */ static int br_should_become_root_port(const struct net_bridge_port *p, u16 root_port) { struct net_bridge *br; struct net_bridge_port *rp; int t; br = p->br; if (p->state == BR_STATE_DISABLED || br_is_designated_port(p)) return 0; if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0) return 0; if (!root_port) return 1; rp = br_get_port(br, root_port); t = memcmp(&p->designated_root, &rp->designated_root, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->designated_cost + p->path_cost < rp->designated_cost + rp->path_cost) return 1; else if (p->designated_cost + p->path_cost > rp->designated_cost + rp->path_cost) return 0; t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->designated_port < rp->designated_port) return 1; else if (p->designated_port > rp->designated_port) return 0; if (p->port_id < rp->port_id) return 1; return 0; } static void br_root_port_block(const struct net_bridge *br, struct net_bridge_port *p) { br_notice(br, "port %u(%s) tried to become root port (blocked)", (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay > 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } /* called under bridge lock */ static void br_root_selection(struct net_bridge *br) { struct net_bridge_port *p; u16 root_port = 0; list_for_each_entry(p, &br->port_list, list) { if (!br_should_become_root_port(p, root_port)) continue; if (p->flags & BR_ROOT_BLOCK) br_root_port_block(br, p); else root_port = p->port_no; } br->root_port = root_port; if (!root_port) { br->designated_root = br->bridge_id; br->root_path_cost = 0; } else { p = br_get_port(br, root_port); br->designated_root = p->designated_root; br->root_path_cost = p->designated_cost + p->path_cost; } } /* called under bridge lock */ void br_become_root_bridge(struct net_bridge *br) { br->max_age = br->bridge_max_age; br->hello_time = br->bridge_hello_time; br->forward_delay = br->bridge_forward_delay; br_topology_change_detection(br); del_timer(&br->tcn_timer); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); mod_timer(&br->hello_timer, jiffies + br->hello_time); } } /* called under bridge lock */ void br_transmit_config(struct net_bridge_port *p) { struct br_config_bpdu bpdu; struct net_bridge *br; if (timer_pending(&p->hold_timer)) { p->config_pending = 1; return; } br = p->br; bpdu.topology_change = br->topology_change; bpdu.topology_change_ack = p->topology_change_ack; bpdu.root = br->designated_root; bpdu.root_path_cost = br->root_path_cost; bpdu.bridge_id = br->bridge_id; bpdu.port_id = p->port_id; if (br_is_root_bridge(br)) bpdu.message_age = 0; else { struct net_bridge_port *root = br_get_port(br, br->root_port); bpdu.message_age = (jiffies - root->designated_age) + MESSAGE_AGE_INCR; } bpdu.max_age = br->max_age; bpdu.hello_time = br->hello_time; bpdu.forward_delay = br->forward_delay; if (bpdu.message_age < br->max_age) { br_send_config_bpdu(p, &bpdu); p->topology_change_ack = 0; p->config_pending = 0; if (p->br->stp_enabled == BR_KERNEL_STP) mod_timer(&p->hold_timer, round_jiffies(jiffies + BR_HOLD_TIME)); } } /* called under bridge lock */ static void br_record_config_information(struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { p->designated_root = bpdu->root; p->designated_cost = bpdu->root_path_cost; p->designated_bridge = bpdu->bridge_id; p->designated_port = bpdu->port_id; p->designated_age = jiffies - bpdu->message_age; mod_timer(&p->message_age_timer, jiffies + (bpdu->max_age - bpdu->message_age)); } /* called under bridge lock */ static void br_record_config_timeout_values(struct net_bridge *br, const struct br_config_bpdu *bpdu) { br->max_age = bpdu->max_age; br->hello_time = bpdu->hello_time; br->forward_delay = bpdu->forward_delay; __br_set_topology_change(br, bpdu->topology_change); } /* called under bridge lock */ void br_transmit_tcn(struct net_bridge *br) { struct net_bridge_port *p; p = br_get_port(br, br->root_port); if (p) br_send_tcn_bpdu(p); else br_notice(br, "root port %u not found for topology notice\n", br->root_port); } /* called under bridge lock */ static int br_should_become_designated_port(const struct net_bridge_port *p) { struct net_bridge *br; int t; br = p->br; if (br_is_designated_port(p)) return 1; if (memcmp(&p->designated_root, &br->designated_root, 8)) return 1; if (br->root_path_cost < p->designated_cost) return 1; else if (br->root_path_cost > p->designated_cost) return 0; t = memcmp(&br->bridge_id, &p->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->port_id < p->designated_port) return 1; return 0; } /* called under bridge lock */ static void br_designated_port_selection(struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_should_become_designated_port(p)) br_become_designated_port(p); } } /* called under bridge lock */ static int br_supersedes_port_info(const struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { int t; t = memcmp(&bpdu->root, &p->designated_root, 8); if (t < 0) return 1; else if (t > 0) return 0; if (bpdu->root_path_cost < p->designated_cost) return 1; else if (bpdu->root_path_cost > p->designated_cost) return 0; t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8)) return 1; if (bpdu->port_id <= p->designated_port) return 1; return 0; } /* called under bridge lock */ static void br_topology_change_acknowledged(struct net_bridge *br) { br->topology_change_detected = 0; del_timer(&br->tcn_timer); } /* called under bridge lock */ void br_topology_change_detection(struct net_bridge *br) { int isroot = br_is_root_bridge(br); if (br->stp_enabled != BR_KERNEL_STP) return; br_info(br, "topology change detected, %s\n", isroot ? "propagating" : "sending tcn bpdu"); if (isroot) { __br_set_topology_change(br, 1); mod_timer(&br->topology_change_timer, jiffies + br->bridge_forward_delay + br->bridge_max_age); } else if (!br->topology_change_detected) { br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } br->topology_change_detected = 1; } /* called under bridge lock */ void br_config_bpdu_generation(struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_is_designated_port(p)) br_transmit_config(p); } } /* called under bridge lock */ static void br_reply(struct net_bridge_port *p) { br_transmit_config(p); } /* called under bridge lock */ void br_configuration_update(struct net_bridge *br) { br_root_selection(br); br_designated_port_selection(br); } /* called under bridge lock */ void br_become_designated_port(struct net_bridge_port *p) { struct net_bridge *br; br = p->br; p->designated_root = br->designated_root; p->designated_cost = br->root_path_cost; p->designated_bridge = br->bridge_id; p->designated_port = p->port_id; } /* called under bridge lock */ static void br_make_blocking(struct net_bridge_port *p) { if (p->state != BR_STATE_DISABLED && p->state != BR_STATE_BLOCKING) { if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->forward_delay_timer); } } /* called under bridge lock */ static void br_make_forwarding(struct net_bridge_port *p) { struct net_bridge *br = p->br; if (p->state != BR_STATE_BLOCKING) return; if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { br_set_state(p, BR_STATE_FORWARDING); br_topology_change_detection(br); del_timer(&p->forward_delay_timer); } else if (br->stp_enabled == BR_KERNEL_STP) br_set_state(p, BR_STATE_LISTENING); else br_set_state(p, BR_STATE_LEARNING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay != 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } /* called under bridge lock */ void br_port_state_selection(struct net_bridge *br) { struct net_bridge_port *p; unsigned int liveports = 0; list_for_each_entry(p, &br->port_list, list) { if (p->state == BR_STATE_DISABLED) continue; /* Don't change port states if userspace is handling STP */ if (br->stp_enabled != BR_USER_STP) { if (p->port_no == br->root_port) { p->config_pending = 0; p->topology_change_ack = 0; br_make_forwarding(p); } else if (br_is_designated_port(p)) { del_timer(&p->message_age_timer); br_make_forwarding(p); } else { p->config_pending = 0; p->topology_change_ack = 0; br_make_blocking(p); } } if (p->state != BR_STATE_BLOCKING) br_multicast_enable_port(p); /* Multicast is not disabled for the port when it goes in * blocking state because the timers will expire and stop by * themselves without sending more queries. */ if (p->state == BR_STATE_FORWARDING) ++liveports; } if (liveports == 0) netif_carrier_off(br->dev); else netif_carrier_on(br->dev); } /* called under bridge lock */ static void br_topology_change_acknowledge(struct net_bridge_port *p) { p->topology_change_ack = 1; br_transmit_config(p); } /* called under bridge lock */ void br_received_config_bpdu(struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { struct net_bridge *br; int was_root; p->stp_xstats.rx_bpdu++; br = p->br; was_root = br_is_root_bridge(br); if (br_supersedes_port_info(p, bpdu)) { br_record_config_information(p, bpdu); br_configuration_update(br); br_port_state_selection(br); if (!br_is_root_bridge(br) && was_root) { del_timer(&br->hello_timer); if (br->topology_change_detected) { del_timer(&br->topology_change_timer); br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } } if (p->port_no == br->root_port) { br_record_config_timeout_values(br, bpdu); br_config_bpdu_generation(br); if (bpdu->topology_change_ack) br_topology_change_acknowledged(br); } } else if (br_is_designated_port(p)) { br_reply(p); } } /* called under bridge lock */ void br_received_tcn_bpdu(struct net_bridge_port *p) { p->stp_xstats.rx_tcn++; if (br_is_designated_port(p)) { br_info(p->br, "port %u(%s) received tcn bpdu\n", (unsigned int) p->port_no, p->dev->name); br_topology_change_detection(p->br); br_topology_change_acknowledge(p); } } /* Change bridge STP parameter */ int br_set_hello_time(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME) return -ERANGE; spin_lock_bh(&br->lock); br->bridge_hello_time = t; if (br_is_root_bridge(br)) br->hello_time = br->bridge_hello_time; spin_unlock_bh(&br->lock); return 0; } int br_set_max_age(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE) return -ERANGE; spin_lock_bh(&br->lock); br->bridge_max_age = t; if (br_is_root_bridge(br)) br->max_age = br->bridge_max_age; spin_unlock_bh(&br->lock); return 0; } /* called under bridge lock */ int __set_ageing_time(struct net_device *dev, unsigned long t) { struct switchdev_attr attr = { .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER, .u.ageing_time = jiffies_to_clock_t(t), }; int err; err = switchdev_port_attr_set(dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; return 0; } /* Set time interval that dynamic forwarding entries live * For pure software bridge, allow values outside the 802.1 * standard specification for special cases: * 0 - entry never ages (all permanent) * 1 - entry disappears (no persistence) * * Offloaded switch entries maybe more restrictive */ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) { unsigned long t = clock_t_to_jiffies(ageing_time); int err; err = __set_ageing_time(br->dev, t); if (err) return err; spin_lock_bh(&br->lock); br->bridge_ageing_time = t; br->ageing_time = t; spin_unlock_bh(&br->lock); mod_delayed_work(system_long_wq, &br->gc_work, 0); return 0; } clock_t br_get_ageing_time(const struct net_device *br_dev) { const struct net_bridge *br; if (!netif_is_bridge_master(br_dev)) return 0; br = netdev_priv(br_dev); return jiffies_to_clock_t(br->ageing_time); } EXPORT_SYMBOL_GPL(br_get_ageing_time); /* called under bridge lock */ void __br_set_topology_change(struct net_bridge *br, unsigned char val) { unsigned long t; int err; if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) { /* On topology change, set the bridge ageing time to twice the * forward delay. Otherwise, restore its default ageing time. */ if (val) { t = 2 * br->forward_delay; br_debug(br, "decreasing ageing time to %lu\n", t); } else { t = br->bridge_ageing_time; br_debug(br, "restoring ageing time to %lu\n", t); } err = __set_ageing_time(br->dev, t); if (err) br_warn(br, "error offloading ageing time\n"); else br->ageing_time = t; } br->topology_change = val; } void __br_set_forward_delay(struct net_bridge *br, unsigned long t) { br->bridge_forward_delay = t; if (br_is_root_bridge(br)) br->forward_delay = br->bridge_forward_delay; } int br_set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); int err = -ERANGE; spin_lock_bh(&br->lock); if (br->stp_enabled != BR_NO_STP && (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) goto unlock; __br_set_forward_delay(br, t); err = 0; unlock: spin_unlock_bh(&br->lock); return err; }
40 57 56 8 8 57 57 40 40 29 12 12 53 53 53 56 57 1876 1884 1877 1340 14 12 1869 1882 1882 38 37 38 38 38 37 38 36 36 36 36 36 36 36 36 36 36 38 14 38 14 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 36 10 38 10 10 10 10 10 10 10 10 10 10 10 10 3 8 8 3 10 10 9 9 9 38 38 38 4 38 38 38 38 38 38 37 38 38 38 38 38 38 38 38 38 37 37 15 15 15 15 15 15 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/domain.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/binfmts.h> #include <linux/slab.h> #include <linux/rculist.h> /* Variables definitions.*/ /* The initial domain. */ struct tomoyo_domain_info tomoyo_kernel_domain; /** * tomoyo_update_policy - Update an entry for exception policy. * * @new_entry: Pointer to "struct tomoyo_acl_info". * @size: Size of @new_entry in bytes. * @param: Pointer to "struct tomoyo_acl_param". * @check_duplicate: Callback function to find duplicated entry. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate)(const struct tomoyo_acl_head *, const struct tomoyo_acl_head *)) { int error = param->is_delete ? -ENOENT : -ENOMEM; struct tomoyo_acl_head *entry; struct list_head *list = param->list; if (mutex_lock_interruptible(&tomoyo_policy_lock)) return -ENOMEM; list_for_each_entry_rcu(entry, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!check_duplicate(entry, new_entry)) continue; entry->is_deleted = param->is_delete; error = 0; break; } if (error && !param->is_delete) { entry = tomoyo_commit_ok(new_entry, size); if (entry) { list_add_tail_rcu(&entry->list, list); error = 0; } } mutex_unlock(&tomoyo_policy_lock); return error; } /** * tomoyo_same_acl_head - Check for duplicated "struct tomoyo_acl_info" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { return a->type == b->type && a->cond == b->cond; } /** * tomoyo_update_domain - Update an entry for domain policy. * * @new_entry: Pointer to "struct tomoyo_acl_info". * @size: Size of @new_entry in bytes. * @param: Pointer to "struct tomoyo_acl_param". * @check_duplicate: Callback function to find duplicated entry. * @merge_duplicate: Callback function to merge duplicated entry. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size, struct tomoyo_acl_param *param, bool (*check_duplicate)(const struct tomoyo_acl_info *, const struct tomoyo_acl_info *), bool (*merge_duplicate)(struct tomoyo_acl_info *, struct tomoyo_acl_info *, const bool)) { const bool is_delete = param->is_delete; int error = is_delete ? -ENOENT : -ENOMEM; struct tomoyo_acl_info *entry; struct list_head * const list = param->list; if (param->data[0]) { new_entry->cond = tomoyo_get_condition(param); if (!new_entry->cond) return -EINVAL; /* * Domain transition preference is allowed for only * "file execute" entries. */ if (new_entry->cond->transit && !(new_entry->type == TOMOYO_TYPE_PATH_ACL && container_of(new_entry, struct tomoyo_path_acl, head) ->perm == 1 << TOMOYO_TYPE_EXECUTE)) goto out; } if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; list_for_each_entry_rcu(entry, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS) continue; if (!tomoyo_same_acl_head(entry, new_entry) || !check_duplicate(entry, new_entry)) continue; if (merge_duplicate) entry->is_deleted = merge_duplicate(entry, new_entry, is_delete); else entry->is_deleted = is_delete; error = 0; break; } if (error && !is_delete) { entry = tomoyo_commit_ok(new_entry, size); if (entry) { list_add_tail_rcu(&entry->list, list); error = 0; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_condition(new_entry->cond); return error; } /** * tomoyo_check_acl - Do permission check. * * @r: Pointer to "struct tomoyo_request_info". * @check_entry: Callback function to check type specific parameters. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ void tomoyo_check_acl(struct tomoyo_request_info *r, bool (*check_entry)(struct tomoyo_request_info *, const struct tomoyo_acl_info *)) { const struct tomoyo_domain_info *domain = r->domain; struct tomoyo_acl_info *ptr; const struct list_head *list = &domain->acl_info_list; u16 i = 0; retry: list_for_each_entry_rcu(ptr, list, list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->is_deleted || ptr->type != r->param_type) continue; if (!check_entry(r, ptr)) continue; if (!tomoyo_condition(r, ptr->cond)) continue; r->matched_acl = ptr; r->granted = true; return; } for (; i < TOMOYO_MAX_ACL_GROUPS; i++) { if (!test_bit(i, domain->group)) continue; list = &domain->ns->acl_group[i++]; goto retry; } r->granted = false; } /* The list for "struct tomoyo_domain_info". */ LIST_HEAD(tomoyo_domain_list); /** * tomoyo_last_word - Get last component of a domainname. * * @name: Domainname to check. * * Returns the last word of @domainname. */ static const char *tomoyo_last_word(const char *name) { const char *cp = strrchr(name, ' '); if (cp) return cp + 1; return name; } /** * tomoyo_same_transition_control - Check for duplicated "struct tomoyo_transition_control" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { const struct tomoyo_transition_control *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_transition_control *p2 = container_of(b, typeof(*p2), head); return p1->type == p2->type && p1->is_last_name == p2->is_last_name && p1->domainname == p2->domainname && p1->program == p2->program; } /** * tomoyo_write_transition_control - Write "struct tomoyo_transition_control" list. * * @param: Pointer to "struct tomoyo_acl_param". * @type: Type of this entry. * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_transition_control(struct tomoyo_acl_param *param, const u8 type) { struct tomoyo_transition_control e = { .type = type }; int error = param->is_delete ? -ENOENT : -ENOMEM; char *program = param->data; char *domainname = strstr(program, " from "); if (domainname) { *domainname = '\0'; domainname += 6; } else if (type == TOMOYO_TRANSITION_CONTROL_NO_KEEP || type == TOMOYO_TRANSITION_CONTROL_KEEP) { domainname = program; program = NULL; } if (program && strcmp(program, "any")) { if (!tomoyo_correct_path(program)) return -EINVAL; e.program = tomoyo_get_name(program); if (!e.program) goto out; } if (domainname && strcmp(domainname, "any")) { if (!tomoyo_correct_domain(domainname)) { if (!tomoyo_correct_path(domainname)) goto out; e.is_last_name = true; } e.domainname = tomoyo_get_name(domainname); if (!e.domainname) goto out; } param->list = &param->ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; error = tomoyo_update_policy(&e.head, sizeof(e), param, tomoyo_same_transition_control); out: tomoyo_put_name(e.domainname); tomoyo_put_name(e.program); return error; } /** * tomoyo_scan_transition - Try to find specific domain transition type. * * @list: Pointer to "struct list_head". * @domainname: The name of current domain. * @program: The name of requested program. * @last_name: The last component of @domainname. * @type: One of values in "enum tomoyo_transition_type". * * Returns true if found one, false otherwise. * * Caller holds tomoyo_read_lock(). */ static inline bool tomoyo_scan_transition (const struct list_head *list, const struct tomoyo_path_info *domainname, const struct tomoyo_path_info *program, const char *last_name, const enum tomoyo_transition_type type) { const struct tomoyo_transition_control *ptr; list_for_each_entry_rcu(ptr, list, head.list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || ptr->type != type) continue; if (ptr->domainname) { if (!ptr->is_last_name) { if (ptr->domainname != domainname) continue; } else { /* * Use direct strcmp() since this is * unlikely used. */ if (strcmp(ptr->domainname->name, last_name)) continue; } } if (ptr->program && tomoyo_pathcmp(ptr->program, program)) continue; return true; } return false; } /** * tomoyo_transition_type - Get domain transition type. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @domainname: The name of current domain. * @program: The name of requested program. * * Returns TOMOYO_TRANSITION_CONTROL_TRANSIT if executing @program causes * domain transition across namespaces, TOMOYO_TRANSITION_CONTROL_INITIALIZE if * executing @program reinitializes domain transition within that namespace, * TOMOYO_TRANSITION_CONTROL_KEEP if executing @program stays at @domainname , * others otherwise. * * Caller holds tomoyo_read_lock(). */ static enum tomoyo_transition_type tomoyo_transition_type (const struct tomoyo_policy_namespace *ns, const struct tomoyo_path_info *domainname, const struct tomoyo_path_info *program) { const char *last_name = tomoyo_last_word(domainname->name); enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET; while (type < TOMOYO_MAX_TRANSITION_TYPE) { const struct list_head * const list = &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL]; if (!tomoyo_scan_transition(list, domainname, program, last_name, type)) { type++; continue; } if (type != TOMOYO_TRANSITION_CONTROL_NO_RESET && type != TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE) break; /* * Do not check for reset_domain if no_reset_domain matched. * Do not check for initialize_domain if no_initialize_domain * matched. */ type++; type++; } return type; } /** * tomoyo_same_aggregator - Check for duplicated "struct tomoyo_aggregator" entry. * * @a: Pointer to "struct tomoyo_acl_head". * @b: Pointer to "struct tomoyo_acl_head". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a, const struct tomoyo_acl_head *b) { const struct tomoyo_aggregator *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_aggregator *p2 = container_of(b, typeof(*p2), head); return p1->original_name == p2->original_name && p1->aggregated_name == p2->aggregated_name; } /** * tomoyo_write_aggregator - Write "struct tomoyo_aggregator" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_aggregator(struct tomoyo_acl_param *param) { struct tomoyo_aggregator e = { }; int error = param->is_delete ? -ENOENT : -ENOMEM; const char *original_name = tomoyo_read_token(param); const char *aggregated_name = tomoyo_read_token(param); if (!tomoyo_correct_word(original_name) || !tomoyo_correct_path(aggregated_name)) return -EINVAL; e.original_name = tomoyo_get_name(original_name); e.aggregated_name = tomoyo_get_name(aggregated_name); if (!e.original_name || !e.aggregated_name || e.aggregated_name->is_patterned) /* No patterns allowed. */ goto out; param->list = &param->ns->policy_list[TOMOYO_ID_AGGREGATOR]; error = tomoyo_update_policy(&e.head, sizeof(e), param, tomoyo_same_aggregator); out: tomoyo_put_name(e.original_name); tomoyo_put_name(e.aggregated_name); return error; } /** * tomoyo_find_namespace - Find specified namespace. * * @name: Name of namespace to find. * @len: Length of @name. * * Returns pointer to "struct tomoyo_policy_namespace" if found, * NULL otherwise. * * Caller holds tomoyo_read_lock(). */ static struct tomoyo_policy_namespace *tomoyo_find_namespace (const char *name, const unsigned int len) { struct tomoyo_policy_namespace *ns; list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) { if (strncmp(name, ns->name, len) || (name[len] && name[len] != ' ')) continue; return ns; } return NULL; } /** * tomoyo_assign_namespace - Create a new namespace. * * @domainname: Name of namespace to create. * * Returns pointer to "struct tomoyo_policy_namespace" on success, * NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname) { struct tomoyo_policy_namespace *ptr; struct tomoyo_policy_namespace *entry; const char *cp = domainname; unsigned int len = 0; while (*cp && *cp++ != ' ') len++; ptr = tomoyo_find_namespace(domainname, len); if (ptr) return ptr; if (len >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_domain_def(domainname)) return NULL; entry = kzalloc(sizeof(*entry) + len + 1, GFP_NOFS | __GFP_NOWARN); if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; ptr = tomoyo_find_namespace(domainname, len); if (!ptr && tomoyo_memory_ok(entry)) { char *name = (char *) (entry + 1); ptr = entry; memmove(name, domainname, len); name[len] = '\0'; entry->name = name; tomoyo_init_policy_namespace(entry); entry = NULL; } mutex_unlock(&tomoyo_policy_lock); out: kfree(entry); return ptr; } /** * tomoyo_namespace_jump - Check for namespace jump. * * @domainname: Name of domain. * * Returns true if namespace differs, false otherwise. */ static bool tomoyo_namespace_jump(const char *domainname) { const char *namespace = tomoyo_current_namespace()->name; const int len = strlen(namespace); return strncmp(domainname, namespace, len) || (domainname[len] && domainname[len] != ' '); } /** * tomoyo_assign_domain - Create a domain or a namespace. * * @domainname: The name of domain. * @transit: True if transit to domain found or created. * * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname, const bool transit) { struct tomoyo_domain_info e = { }; struct tomoyo_domain_info *entry = tomoyo_find_domain(domainname); bool created = false; if (entry) { if (transit) { /* * Since namespace is created at runtime, profiles may * not be created by the moment the process transits to * that domain. Do not perform domain transition if * profile for that domain is not yet created. */ if (tomoyo_policy_loaded && !entry->ns->profile_ptr[entry->profile]) return NULL; } return entry; } /* Requested domain does not exist. */ /* Don't create requested domain if domainname is invalid. */ if (strlen(domainname) >= TOMOYO_EXEC_TMPSIZE - 10 || !tomoyo_correct_domain(domainname)) return NULL; /* * Since definition of profiles and acl_groups may differ across * namespaces, do not inherit "use_profile" and "use_group" settings * by automatically creating requested domain upon domain transition. */ if (transit && tomoyo_namespace_jump(domainname)) return NULL; e.ns = tomoyo_assign_namespace(domainname); if (!e.ns) return NULL; /* * "use_profile" and "use_group" settings for automatically created * domains are inherited from current domain. These are 0 for manually * created domains. */ if (transit) { const struct tomoyo_domain_info *domain = tomoyo_domain(); e.profile = domain->profile; memcpy(e.group, domain->group, sizeof(e.group)); } e.domainname = tomoyo_get_name(domainname); if (!e.domainname) return NULL; if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; entry = tomoyo_find_domain(domainname); if (!entry) { entry = tomoyo_commit_ok(&e, sizeof(e)); if (entry) { INIT_LIST_HEAD(&entry->acl_info_list); list_add_tail_rcu(&entry->list, &tomoyo_domain_list); created = true; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_name(e.domainname); if (entry && transit) { if (created) { struct tomoyo_request_info r; int i; tomoyo_init_request_info(&r, entry, TOMOYO_MAC_FILE_EXECUTE); r.granted = false; tomoyo_write_log(&r, "use_profile %u\n", entry->profile); for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++) if (test_bit(i, entry->group)) tomoyo_write_log(&r, "use_group %u\n", i); tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES); } } return entry; } /** * tomoyo_environ - Check permission for environment variable names. * * @ee: Pointer to "struct tomoyo_execve". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_environ(struct tomoyo_execve *ee) { struct tomoyo_request_info *r = &ee->r; struct linux_binprm *bprm = ee->bprm; /* env_page.data is allocated by tomoyo_dump_page(). */ struct tomoyo_page_dump env_page = { }; char *arg_ptr; /* Size is TOMOYO_EXEC_TMPSIZE bytes */ int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; int error = -ENOMEM; ee->r.type = TOMOYO_MAC_ENVIRON; ee->r.profile = r->domain->profile; ee->r.mode = tomoyo_get_mode(r->domain->ns, ee->r.profile, TOMOYO_MAC_ENVIRON); if (!r->mode || !envp_count) return 0; arg_ptr = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS); if (!arg_ptr) goto out; while (error == -ENOMEM) { if (!tomoyo_dump_page(bprm, pos, &env_page)) goto out; pos += PAGE_SIZE - offset; /* Read. */ while (argv_count && offset < PAGE_SIZE) { if (!env_page.data[offset++]) argv_count--; } if (argv_count) { offset = 0; continue; } while (offset < PAGE_SIZE) { const unsigned char c = env_page.data[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '=') { arg_ptr[arg_len++] = '\0'; } else if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; if (tomoyo_env_perm(r, arg_ptr)) { error = -EPERM; break; } if (!--envp_count) { error = 0; break; } arg_len = 0; } offset = 0; } out: if (r->mode != TOMOYO_CONFIG_ENFORCING) error = 0; kfree(env_page.data); kfree(arg_ptr); return error; } /** * tomoyo_find_next_domain - Find a domain. * * @bprm: Pointer to "struct linux_binprm". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_find_next_domain(struct linux_binprm *bprm) { struct tomoyo_domain_info *old_domain = tomoyo_domain(); struct tomoyo_domain_info *domain = NULL; const char *original_name = bprm->filename; int retval = -ENOMEM; bool reject_on_transition_failure = false; const struct tomoyo_path_info *candidate; struct tomoyo_path_info exename; struct tomoyo_execve *ee = kzalloc(sizeof(*ee), GFP_NOFS); if (!ee) return -ENOMEM; ee->tmp = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS); if (!ee->tmp) { kfree(ee); return -ENOMEM; } /* ee->dump->data is allocated by tomoyo_dump_page(). */ tomoyo_init_request_info(&ee->r, NULL, TOMOYO_MAC_FILE_EXECUTE); ee->r.ee = ee; ee->bprm = bprm; ee->r.obj = &ee->obj; ee->obj.path1 = bprm->file->f_path; /* Get symlink's pathname of program. */ exename.name = tomoyo_realpath_nofollow(original_name); if (!exename.name) { /* Fallback to realpath if symlink's pathname does not exist. */ exename.name = tomoyo_realpath_from_path(&bprm->file->f_path); if (!exename.name) goto out; } tomoyo_fill_path_info(&exename); retry: /* Check 'aggregator' directive. */ { struct tomoyo_aggregator *ptr; struct list_head *list = &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR]; /* Check 'aggregator' directive. */ candidate = &exename; list_for_each_entry_rcu(ptr, list, head.list, srcu_read_lock_held(&tomoyo_ss)) { if (ptr->head.is_deleted || !tomoyo_path_matches_pattern(&exename, ptr->original_name)) continue; candidate = ptr->aggregated_name; break; } } /* Check execute permission. */ retval = tomoyo_execute_permission(&ee->r, candidate); if (retval == TOMOYO_RETRY_REQUEST) goto retry; if (retval < 0) goto out; /* * To be able to specify domainnames with wildcards, use the * pathname specified in the policy (which may contain * wildcard) rather than the pathname passed to execve() * (which never contains wildcard). */ if (ee->r.param.path.matched_path) candidate = ee->r.param.path.matched_path; /* * Check for domain transition preference if "file execute" matched. * If preference is given, make execve() fail if domain transition * has failed, for domain transition preference should be used with * destination domain defined. */ if (ee->transition) { const char *domainname = ee->transition->name; reject_on_transition_failure = true; if (!strcmp(domainname, "keep")) goto force_keep_domain; if (!strcmp(domainname, "child")) goto force_child_domain; if (!strcmp(domainname, "reset")) goto force_reset_domain; if (!strcmp(domainname, "initialize")) goto force_initialize_domain; if (!strcmp(domainname, "parent")) { char *cp; strscpy(ee->tmp, old_domain->domainname->name, TOMOYO_EXEC_TMPSIZE); cp = strrchr(ee->tmp, ' '); if (cp) *cp = '\0'; } else if (*domainname == '<') strscpy(ee->tmp, domainname, TOMOYO_EXEC_TMPSIZE); else snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->domainname->name, domainname); goto force_jump_domain; } /* * No domain transition preference specified. * Calculate domain to transit to. */ switch (tomoyo_transition_type(old_domain->ns, old_domain->domainname, candidate)) { case TOMOYO_TRANSITION_CONTROL_RESET: force_reset_domain: /* Transit to the root of specified namespace. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>", candidate->name); /* * Make execve() fail if domain transition across namespaces * has failed. */ reject_on_transition_failure = true; break; case TOMOYO_TRANSITION_CONTROL_INITIALIZE: force_initialize_domain: /* Transit to the child of current namespace's root. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->ns->name, candidate->name); break; case TOMOYO_TRANSITION_CONTROL_KEEP: force_keep_domain: /* Keep current domain. */ domain = old_domain; break; default: if (old_domain == &tomoyo_kernel_domain && !tomoyo_policy_loaded) { /* * Needn't to transit from kernel domain before * starting /sbin/init. But transit from kernel domain * if executing initializers because they might start * before /sbin/init. */ domain = old_domain; break; } force_child_domain: /* Normal domain transition. */ snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s", old_domain->domainname->name, candidate->name); break; } force_jump_domain: if (!domain) domain = tomoyo_assign_domain(ee->tmp, true); if (domain) retval = 0; else if (reject_on_transition_failure) { pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp); retval = -ENOMEM; } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING) retval = -ENOMEM; else { retval = 0; if (!old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED]) { old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED] = true; ee->r.granted = false; tomoyo_write_log(&ee->r, "%s", tomoyo_dif [TOMOYO_DIF_TRANSITION_FAILED]); pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp); } } out: if (!domain) domain = old_domain; /* Update reference count on "struct tomoyo_domain_info". */ { struct tomoyo_task *s = tomoyo_task(current); s->old_domain_info = s->domain_info; s->domain_info = domain; atomic_inc(&domain->users); } kfree(exename.name); if (!retval) { ee->r.domain = domain; retval = tomoyo_environ(ee); } kfree(ee->tmp); kfree(ee->dump.data); kfree(ee); return retval; } /** * tomoyo_dump_page - Dump a page to buffer. * * @bprm: Pointer to "struct linux_binprm". * @pos: Location to dump. * @dump: Pointer to "struct tomoyo_page_dump". * * Returns true on success, false otherwise. */ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, struct tomoyo_page_dump *dump) { struct page *page; #ifdef CONFIG_MMU int ret; #endif /* dump->data is released by tomoyo_find_next_domain(). */ if (!dump->data) { dump->data = kzalloc(PAGE_SIZE, GFP_NOFS); if (!dump->data) return false; } /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */ #ifdef CONFIG_MMU /* * This is called at execve() time in order to dig around * in the argv/environment of the new proceess * (represented by bprm). */ mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, FOLL_FORCE, &page, NULL); mmap_read_unlock(bprm->mm); if (ret <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; #endif if (page != dump->page) { const unsigned int offset = pos % PAGE_SIZE; /* * Maybe kmap()/kunmap() should be used here. * But remove_arg_zero() uses kmap_atomic()/kunmap_atomic(). * So do I. */ char *kaddr = kmap_atomic(page); dump->page = page; memcpy(dump->data + offset, kaddr + offset, PAGE_SIZE - offset); kunmap_atomic(kaddr); } /* Same with put_arg_page(page) in fs/exec.c */ #ifdef CONFIG_MMU put_page(page); #endif return true; }
8 8 8 8 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2020 Anna Schumaker <Anna.Schumaker@Netapp.com> */ #include <linux/sunrpc/clnt.h> #include <linux/kobject.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/xprtsock.h> #include "sysfs.h" struct xprt_addr { const char *addr; struct rcu_head rcu; }; static void free_xprt_addr(struct rcu_head *head) { struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu); kfree(addr->addr); kfree(addr); } static struct kset *rpc_sunrpc_kset; static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; static void rpc_sysfs_object_release(struct kobject *kobj) { kfree(kobj); } static const struct kobj_ns_type_operations * rpc_sysfs_object_child_ns_type(const struct kobject *kobj) { return &net_ns_type_operations; } static const struct kobj_type rpc_sysfs_object_type = { .release = rpc_sysfs_object_release, .sysfs_ops = &kobj_sysfs_ops, .child_ns_type = rpc_sysfs_object_child_ns_type, }; static struct kobject *rpc_sysfs_object_alloc(const char *name, struct kset *kset, struct kobject *parent) { struct kobject *kobj; kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); if (kobj) { kobj->kset = kset; if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, parent, "%s", name) == 0) return kobj; kobject_put(kobj); } return NULL; } static inline struct rpc_xprt * rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) { struct rpc_sysfs_xprt *x = container_of(kobj, struct rpc_sysfs_xprt, kobject); return xprt_get(x->xprt); } static inline struct rpc_xprt_switch * rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj) { struct rpc_sysfs_xprt *x = container_of(kobj, struct rpc_sysfs_xprt, kobject); return xprt_switch_get(x->xprt_switch); } static inline struct rpc_xprt_switch * rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) { struct rpc_sysfs_xprt_switch *x = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); return xprt_switch_get(x->xprt_switch); } static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; if (!xprt) { ret = sprintf(buf, "<closed>\n"); goto out; } ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); xprt_put(xprt); out: return ret; } static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { ret = sprintf(buf, "<closed>\n"); } else if (xprt->ops->get_srcaddr) { ret = xprt->ops->get_srcaddr(xprt, buf, buflen); if (ret > 0) { if (ret < buflen - 1) { buf[ret] = '\n'; ret++; buf[ret] = '\0'; } } else ret = sprintf(buf, "<closed>\n"); } else ret = sprintf(buf, "<not a socket>\n"); xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); unsigned short srcport = 0; size_t buflen = PAGE_SIZE; ssize_t ret; if (!xprt || !xprt_connected(xprt)) { ret = sprintf(buf, "<closed>\n"); goto out; } if (xprt->ops->get_srcport) srcport = xprt->ops->get_srcport(xprt); ret = snprintf(buf, buflen, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" "tasks_queuelen=%ld\ndst_port=%s\n", xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, xprt->backlog.qlen, xprt->main, srcport, atomic_long_read(&xprt->queuelen), xprt->address_strings[RPC_DISPLAY_PORT]); out: xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; int locked, connected, connecting, close_wait, bound, binding, closing, congested, cwnd_wait, write_space, offline, remove; if (!(xprt && xprt->state)) { ret = sprintf(buf, "state=CLOSED\n"); } else { locked = test_bit(XPRT_LOCKED, &xprt->state); connected = test_bit(XPRT_CONNECTED, &xprt->state); connecting = test_bit(XPRT_CONNECTING, &xprt->state); close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state); bound = test_bit(XPRT_BOUND, &xprt->state); binding = test_bit(XPRT_BINDING, &xprt->state); closing = test_bit(XPRT_CLOSING, &xprt->state); congested = test_bit(XPRT_CONGESTED, &xprt->state); cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); offline = test_bit(XPRT_OFFLINE, &xprt->state); remove = test_bit(XPRT_REMOVE, &xprt->state); ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n", locked ? "LOCKED" : "", connected ? "CONNECTED" : "", connecting ? "CONNECTING" : "", close_wait ? "CLOSE_WAIT" : "", bound ? "BOUND" : "", binding ? "BOUNDING" : "", closing ? "CLOSING" : "", congested ? "CONGESTED" : "", cwnd_wait ? "CWND_WAIT" : "", write_space ? "WRITE_SPACE" : "", offline ? "OFFLINE" : "", remove ? "REMOVE" : ""); } xprt_put(xprt); return ret; } static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct rpc_xprt_switch *xprt_switch = rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); ssize_t ret; if (!xprt_switch) return 0; ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" "num_unique_destaddr=%u\nqueue_len=%ld\n", xprt_switch->xps_nxprts, xprt_switch->xps_nactive, xprt_switch->xps_nunique_destaddr_xprts, atomic_long_read(&xprt_switch->xps_queuelen)); xprt_switch_put(xprt_switch); return ret; } static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); struct sockaddr *saddr; char *dst_addr; int port; struct xprt_addr *saved_addr; size_t buf_len; if (!xprt) return 0; if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || xprt->xprt_class->ident == XPRT_TRANSPORT_TCP_TLS || xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { xprt_put(xprt); return -EOPNOTSUPP; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); /* buf_len is the len until the first occurence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); if (!dst_addr) goto out_err; saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); if (!saved_addr) goto out_err_free; saved_addr->addr = rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]); rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr); call_rcu(&saved_addr->rcu, free_xprt_addr); xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr, sizeof(*saddr)); rpc_set_port(saddr, port); xprt_force_disconnect(xprt); out: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); return count; out_err_free: kfree(dst_addr); out_err: count = -ENOMEM; goto out; } static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); int offline = 0, online = 0, remove = 0; struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); if (!xprt || !xps) { count = 0; goto out_put; } if (!strncmp(buf, "offline", 7)) offline = 1; else if (!strncmp(buf, "online", 6)) online = 1; else if (!strncmp(buf, "remove", 6)) remove = 1; else { count = -EINVAL; goto out_put; } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; goto out_put; } if (xprt->main) { count = -EINVAL; goto release_tasks; } if (offline) { xprt_set_offline_locked(xprt, xps); } else if (online) { xprt_set_online_locked(xprt, xps); } else if (remove) { if (test_bit(XPRT_OFFLINE, &xprt->state)) xprt_delete_locked(xprt, xps); else count = -EINVAL; } release_tasks: xprt_release_write(xprt, NULL); out_put: xprt_put(xprt); xprt_switch_put(xps); return count; } int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); if (!rpc_sunrpc_kset) return -ENOMEM; rpc_sunrpc_client_kobj = rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL); if (!rpc_sunrpc_client_kobj) goto err_client; rpc_sunrpc_xprt_switch_kobj = rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL); if (!rpc_sunrpc_xprt_switch_kobj) goto err_switch; return 0; err_switch: kobject_put(rpc_sunrpc_client_kobj); rpc_sunrpc_client_kobj = NULL; err_client: kset_unregister(rpc_sunrpc_kset); rpc_sunrpc_kset = NULL; return -ENOMEM; } static void rpc_sysfs_client_release(struct kobject *kobj) { struct rpc_sysfs_client *c; c = container_of(kobj, struct rpc_sysfs_client, kobject); kfree(c); } static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) { struct rpc_sysfs_xprt_switch *xprt_switch; xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); kfree(xprt_switch); } static void rpc_sysfs_xprt_release(struct kobject *kobj) { struct rpc_sysfs_xprt *xprt; xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject); kfree(xprt); } static const void *rpc_sysfs_client_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_client, kobject)->net; } static const void *rpc_sysfs_xprt_switch_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; } static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_xprt, kobject)->xprt->xprt_net; } static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, 0644, rpc_sysfs_xprt_srcaddr_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_srcaddr.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static const struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; static const struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, .default_groups = rpc_sysfs_xprt_switch_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; static const struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, .default_groups = rpc_sysfs_xprt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_namespace, }; void rpc_sysfs_exit(void) { kobject_put(rpc_sunrpc_client_kobj); kobject_put(rpc_sunrpc_xprt_switch_kobj); kset_unregister(rpc_sunrpc_kset); } static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, struct net *net, int clid) { struct rpc_sysfs_client *p; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type, parent, "clnt-%d", clid) == 0) return p; kobject_put(&p->kobject); } return NULL; } static struct rpc_sysfs_xprt_switch * rpc_sysfs_xprt_switch_alloc(struct kobject *parent, struct rpc_xprt_switch *xprt_switch, struct net *net, gfp_t gfp_flags) { struct rpc_sysfs_xprt_switch *p; p = kzalloc(sizeof(*p), gfp_flags); if (p) { p->net = net; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_switch_type, parent, "switch-%d", xprt_switch->xps_id) == 0) return p; kobject_put(&p->kobject); } return NULL; } static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt *p; p = kzalloc(sizeof(*p), gfp_flags); if (!p) goto out; p->kobject.kset = rpc_sunrpc_kset; if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type, parent, "xprt-%d-%s", xprt->id, xprt->address_strings[RPC_DISPLAY_PROTO]) == 0) return p; kobject_put(&p->kobject); out: return NULL; } void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct rpc_xprt_switch *xprt_switch, struct net *net) { struct rpc_sysfs_client *rpc_client; struct rpc_sysfs_xprt_switch *xswitch = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; if (!xswitch) return; rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid); if (rpc_client) { char name[] = "switch"; int ret; clnt->cl_sysfs = rpc_client; rpc_client->clnt = clnt; rpc_client->xprt_switch = xprt_switch; kobject_uevent(&rpc_client->kobject, KOBJ_ADD); ret = sysfs_create_link_nowarn(&rpc_client->kobject, &xswitch->kobject, name); if (ret) pr_warn("can't create link to %s in sysfs (%d)\n", name, ret); } } void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt_switch *rpc_xprt_switch; struct net *net; if (xprt_switch->xps_net) net = xprt_switch->xps_net; else net = xprt->xprt_net; rpc_xprt_switch = rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj, xprt_switch, net, gfp_flags); if (rpc_xprt_switch) { xprt_switch->xps_sysfs = rpc_xprt_switch; rpc_xprt_switch->xprt_switch = xprt_switch; rpc_xprt_switch->xprt = xprt; kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); } else { xprt_switch->xps_sysfs = NULL; } } void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags) { struct rpc_sysfs_xprt *rpc_xprt; struct rpc_sysfs_xprt_switch *switch_obj = (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; if (!switch_obj) return; rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); if (rpc_xprt) { xprt->xprt_sysfs = rpc_xprt; rpc_xprt->xprt = xprt; rpc_xprt->xprt_switch = xprt_switch; kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); } } void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) { struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; if (rpc_client) { char name[] = "switch"; sysfs_remove_link(&rpc_client->kobject, name); kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); kobject_del(&rpc_client->kobject); kobject_put(&rpc_client->kobject); clnt->cl_sysfs = NULL; } } void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) { struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs; if (rpc_xprt_switch) { kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE); kobject_del(&rpc_xprt_switch->kobject); kobject_put(&rpc_xprt_switch->kobject); xprt_switch->xps_sysfs = NULL; } } void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt) { struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs; if (rpc_xprt) { kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE); kobject_del(&rpc_xprt->kobject); kobject_put(&rpc_xprt->kobject); xprt->xprt_sysfs = NULL; } }
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/resource.c * * Copyright (C) 1999 Linus Torvalds * Copyright (C) 1999 Martin Mares <mj@ucw.cz> * * Arbitrary resource management. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/export.h> #include <linux/errno.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/pseudo_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> #include <linux/mm.h> #include <linux/mount.h> #include <linux/resource_ext.h> #include <uapi/linux/magic.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <asm/io.h> struct resource ioport_resource = { .name = "PCI IO", .start = 0, .end = IO_SPACE_LIMIT, .flags = IORESOURCE_IO, }; EXPORT_SYMBOL(ioport_resource); struct resource iomem_resource = { .name = "PCI mem", .start = 0, .end = -1, .flags = IORESOURCE_MEM, }; EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); static struct resource *next_resource(struct resource *p, bool skip_children) { if (!skip_children && p->child) return p->child; while (!p->sibling && p->parent) p = p->parent; return p->sibling; } #define for_each_resource(_root, _p, _skip_children) \ for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children)) #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { struct resource *root = pde_data(file_inode(m->file)); struct resource *p; loff_t l = *pos; read_lock(&resource_lock); for_each_resource(root, p, false) { if (l-- == 0) break; } return p; } static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; (*pos)++; return (void *)next_resource(p, false); } static void r_stop(struct seq_file *m, void *v) __releases(resource_lock) { read_unlock(&resource_lock); } static int r_show(struct seq_file *m, void *v) { struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { start = r->start; end = r->end; } else { start = end = 0; } seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", width, start, width, end, r->name ? r->name : "<BAD>"); return 0; } static const struct seq_operations resource_op = { .start = r_start, .next = r_next, .stop = r_stop, .show = r_show, }; static int __init ioresources_init(void) { proc_create_seq_data("ioports", 0, NULL, &resource_op, &ioport_resource); proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource); return 0; } __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ static void free_resource(struct resource *res) { /** * If the resource was allocated using memblock early during boot * we'll leak it here: we can only return full pages back to the * buddy and trying to be smart and reusing them eventually in * alloc_resource() overcomplicates resource handling. */ if (res && PageSlab(virt_to_head_page(res))) kfree(res); } static struct resource *alloc_resource(gfp_t flags) { return kzalloc(sizeof(struct resource), flags); } /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { resource_size_t start = new->start; resource_size_t end = new->end; struct resource *tmp, **p; if (end < start) return root; if (start < root->start) return root; if (end > root->end) return root; p = &root->child; for (;;) { tmp = *p; if (!tmp || tmp->start > end) { new->sibling = tmp; *p = new; new->parent = root; return NULL; } p = &tmp->sibling; if (tmp->end < start) continue; return tmp; } } static int __release_resource(struct resource *old, bool release_child) { struct resource *tmp, **p, *chd; p = &old->parent->child; for (;;) { tmp = *p; if (!tmp) break; if (tmp == old) { if (release_child || !(tmp->child)) { *p = tmp->sibling; } else { for (chd = tmp->child;; chd = chd->sibling) { chd->parent = tmp->parent; if (!(chd->sibling)) break; } *p = tmp->child; chd->sibling = tmp->sibling; } old->parent = NULL; return 0; } p = &tmp->sibling; } return -EINVAL; } static void __release_child_resources(struct resource *r) { struct resource *tmp, *p; resource_size_t size; p = r->child; r->child = NULL; while (p) { tmp = p; p = p->sibling; tmp->parent = NULL; tmp->sibling = NULL; __release_child_resources(tmp); printk(KERN_DEBUG "release child resource %pR\n", tmp); /* need to restore size, and keep flags */ size = resource_size(tmp); tmp->start = 0; tmp->end = size - 1; } } void release_child_resources(struct resource *r) { write_lock(&resource_lock); __release_child_resources(r); write_unlock(&resource_lock); } /** * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * * Returns 0 for success, conflict resource on error. */ struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); return conflict; } /** * request_resource - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * * Returns 0 for success, negative error code on error. */ int request_resource(struct resource *root, struct resource *new) { struct resource *conflict; conflict = request_resource_conflict(root, new); return conflict ? -EBUSY : 0; } EXPORT_SYMBOL(request_resource); /** * release_resource - release a previously reserved resource * @old: resource pointer */ int release_resource(struct resource *old) { int retval; write_lock(&resource_lock); retval = __release_resource(old, true); write_unlock(&resource_lock); return retval; } EXPORT_SYMBOL(release_resource); /** * find_next_iomem_res - Finds the lowest iomem resource that covers part of * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have * @desc: descriptor the resource must have * @res: return ptr, if resource found * * The caller must specify @start, @end, @flags, and @desc * (which may be IORES_DESC_NONE). */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, struct resource *res) { struct resource *p; if (!res) return -EINVAL; if (start >= end) return -EINVAL; read_lock(&resource_lock); for_each_resource(&iomem_resource, p, false) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; break; } /* Skip until we find a range that matches what we look for */ if (p->end < start) continue; if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; /* Found a match, break */ break; } if (p) { /* copy data */ *res = (struct resource) { .start = max(start, p->start), .end = min(end, p->end), .flags = p->flags, .desc = p->desc, .parent = p->parent, }; } read_unlock(&resource_lock); return p ? 0 : -ENODEV; } static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, void *arg, int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && !find_next_iomem_res(start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; start = res.end + 1; } return ret; } /** * walk_iomem_res_desc - Walks through iomem resources and calls func() * with matching resource ranges. * * * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. * @flags: I/O resource flags * @start: start addr * @end: end addr * @arg: function argument for the callback @func * @func: callback function that is called for each qualifying resource area * * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. * * NOTE: For a new descriptor search, define a new IORES_DESC in * <linux/ioport.h> and set it in 'desc' of a target resource entry. */ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { return __walk_iomem_res_desc(start, end, flags, desc, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * Now, this function is only for System RAM, it deals with full ranges and * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, func); } /* * This function, being a variant of walk_system_ram_res(), calls the @func * callback against all memory ranges of type System RAM which are marked as * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from * higher to lower. */ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { struct resource res, *rams; int rams_size = 16, i; unsigned long flags; int ret = -1; /* create a list */ rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL); if (!rams) return ret; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; i = 0; while ((start < end) && (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) { if (i >= rams_size) { /* re-alloc */ struct resource *rams_new; rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource), GFP_KERNEL); if (!rams_new) goto out; rams = rams_new; rams_size += 16; } rams[i++] = res; start = res.end + 1; } /* go reverse */ for (i--; i >= 0; i--) { ret = (*func)(&rams[i], arg); if (ret) break; } out: kvfree(rams); return ret; } /* * This function calls the @func callback against all memory ranges, which * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY. */ int walk_mem_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, func); } /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) { resource_size_t start, end; unsigned long flags; struct resource res; unsigned long pfn, end_pfn; int ret = -EINVAL; start = (u64) start_pfn << PAGE_SHIFT; end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) { pfn = PFN_UP(res.start); end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; start = res.end + 1; } return ret; } static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; } /* * This generic page_is_ram() returns true if specified address is * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } EXPORT_SYMBOL_GPL(page_is_ram); static int __region_intersects(struct resource *parent, resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { resource_size_t ostart, oend; int type = 0; int other = 0; struct resource *p, *dp; bool is_type, covered; struct resource res; res.start = start; res.end = start + size - 1; for (p = parent->child; p ; p = p->sibling) { if (!resource_overlaps(p, &res)) continue; is_type = (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc); if (is_type) { type++; continue; } /* * Continue to search in descendant resources as if the * matched descendant resources cover some ranges of 'p'. * * |------------- "CXL Window 0" ------------| * |-- "System RAM" --| * * will behave similar as the following fake resource * tree when searching "System RAM". * * |-- "System RAM" --||-- "CXL Window 0a" --| */ covered = false; ostart = max(res.start, p->start); oend = min(res.end, p->end); for_each_resource(p, dp, false) { if (!resource_overlaps(dp, &res)) continue; is_type = (dp->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == dp->desc); if (is_type) { type++; /* * Range from 'ostart' to 'dp->start' * isn't covered by matched resource. */ if (dp->start > ostart) break; if (dp->end >= oend) { covered = true; break; } /* Remove covered range */ ostart = max(ostart, dp->end + 1); } } if (!covered) other++; } if (type == 0) return REGION_DISJOINT; if (other == 0) return REGION_INTERSECTS; return REGION_MIXED; } /** * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region * @flags: flags of resource (in iomem_resource) * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE * * Check if the specified region partially overlaps or fully eclipses a * resource identified by @flags and @desc (optional with IORES_DESC_NONE). * Return REGION_DISJOINT if the region does not overlap @flags/@desc, * return REGION_MIXED if the region overlaps @flags/@desc and another * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc * and no other defined resource. Note that REGION_INTERSECTS is also * returned in the case when the specified region overlaps RAM and undefined * memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. */ int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { int ret; read_lock(&resource_lock); ret = __region_intersects(&iomem_resource, start, size, flags, desc); read_unlock(&resource_lock); return ret; } EXPORT_SYMBOL_GPL(region_intersects); void __weak arch_remove_reservations(struct resource *avail) { } static void resource_clip(struct resource *res, resource_size_t min, resource_size_t max) { if (res->start < min) res->start = min; if (res->end > max) res->end = max; } /* * Find empty space in the resource tree with the given range and * alignment constraints */ static int __find_resource_space(struct resource *root, struct resource *old, struct resource *new, resource_size_t size, struct resource_constraint *constraint) { struct resource *this = root->child; struct resource tmp = *new, avail, alloc; resource_alignf alignf = constraint->alignf; tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to tmp->end below would cause an underflow. */ if (this && this->start == root->start) { tmp.start = (this == old) ? old->start : this->end + 1; this = this->sibling; } for(;;) { if (this) tmp.end = (this == old) ? this->end : this->start - 1; else tmp.end = root->end; if (tmp.end < tmp.start) goto next; resource_clip(&tmp, constraint->min, constraint->max); arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ avail.start = ALIGN(tmp.start, constraint->align); avail.end = tmp.end; avail.flags = new->flags & ~IORESOURCE_UNSET; if (avail.start >= tmp.start) { alloc.flags = avail.flags; if (alignf) { alloc.start = alignf(constraint->alignf_data, &avail, size, constraint->align); } else { alloc.start = avail.start; } alloc.end = alloc.start + size - 1; if (alloc.start <= alloc.end && resource_contains(&avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; } } next: if (!this || this->end == root->end) break; if (this != old) tmp.start = this->end + 1; this = this->sibling; } return -EBUSY; } /** * find_resource_space - Find empty space in the resource tree * @root: Root resource descriptor * @new: Resource descriptor awaiting an empty resource space * @size: The minimum size of the empty space * @constraint: The range and alignment constraints to be met * * Finds an empty space under @root in the resource tree satisfying range and * alignment @constraints. * * Return: * * %0 - if successful, @new members start, end, and flags are altered. * * %-EBUSY - if no empty space was found. */ int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint) { return __find_resource_space(root, NULL, new, size, constraint); } EXPORT_SYMBOL_GPL(find_resource_space); /** * reallocate_resource - allocate a slot in the resource tree given range & alignment. * The resource will be relocated if the new size cannot be reallocated in the * current location. * * @root: root resource descriptor * @old: resource descriptor desired by caller * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. */ static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { int err=0; struct resource new = *old; struct resource *conflict; write_lock(&resource_lock); if ((err = __find_resource_space(root, old, &new, newsize, constraint))) goto out; if (resource_contains(&new, old)) { old->start = new.start; old->end = new.end; goto out; } if (old->child) { err = -EBUSY; goto out; } if (resource_contains(old, &new)) { old->start = new.start; old->end = new.end; } else { __release_resource(old, true); *old = new; conflict = __request_resource(root, old); BUG_ON(conflict); } out: write_unlock(&resource_lock); return err; } /** * allocate_resource - allocate empty slot in the resource tree given range & alignment. * The resource will be reallocated with a new size if it was already allocated * @root: root resource descriptor * @new: resource descriptor desired by caller * @size: requested resource region size * @min: minimum boundary to allocate * @max: maximum boundary to allocate * @align: alignment requested, in bytes * @alignf: alignment function, optional, called if not NULL * @alignf_data: arbitrary data to pass to the @alignf function */ int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, resource_alignf alignf, void *alignf_data) { int err; struct resource_constraint constraint; constraint.min = min; constraint.max = max; constraint.align = align; constraint.alignf = alignf; constraint.alignf_data = alignf_data; if ( new->parent ) { /* resource is already allocated, try reallocating with the new constraints */ return reallocate_resource(root, new, size, &constraint); } write_lock(&resource_lock); err = find_resource_space(root, new, size, &constraint); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); return err; } EXPORT_SYMBOL(allocate_resource); /** * lookup_resource - find an existing resource by a resource start address * @root: root resource descriptor * @start: resource start address * * Returns a pointer to the resource if found, NULL otherwise */ struct resource *lookup_resource(struct resource *root, resource_size_t start) { struct resource *res; read_lock(&resource_lock); for (res = root->child; res; res = res->sibling) { if (res->start == start) break; } read_unlock(&resource_lock); return res; } /* * Insert a resource into the resource tree. If successful, return NULL, * otherwise return the conflicting resource (compare to __request_resource()) */ static struct resource * __insert_resource(struct resource *parent, struct resource *new) { struct resource *first, *next; for (;; parent = first) { first = __request_resource(parent, new); if (!first) return first; if (first == parent) return first; if (WARN_ON(first == new)) /* duplicated insertion */ return first; if ((first->start > new->start) || (first->end < new->end)) break; if ((first->start == new->start) && (first->end == new->end)) break; } for (next = first; ; next = next->sibling) { /* Partial overlap? Bad, and unfixable */ if (next->start < new->start || next->end > new->end) return next; if (!next->sibling) break; if (next->sibling->start > new->end) break; } new->parent = parent; new->sibling = next->sibling; new->child = first; next->sibling = NULL; for (next = first; next; next = next->sibling) next->parent = new; if (parent->child == first) { parent->child = new; } else { next = parent->child; while (next->sibling != first) next = next->sibling; next->sibling = new; } return NULL; } /** * insert_resource_conflict - Inserts resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * * Returns 0 on success, conflict resource if the resource can't be inserted. * * This function is equivalent to request_resource_conflict when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. * * This function is intended for producers of resources, such as FW modules * and bus drivers. */ struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __insert_resource(parent, new); write_unlock(&resource_lock); return conflict; } /** * insert_resource - Inserts a resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * * Returns 0 on success, -EBUSY if the resource can't be inserted. * * This function is intended for producers of resources, such as FW modules * and bus drivers. */ int insert_resource(struct resource *parent, struct resource *new) { struct resource *conflict; conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } EXPORT_SYMBOL_GPL(insert_resource); /** * insert_resource_expand_to_fit - Insert a resource into the resource tree * @root: root resource descriptor * @new: new resource to insert * * Insert a resource into the resource tree, possibly expanding it in order * to make it encompass any conflicting resources. */ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) { if (new->parent) return; write_lock(&resource_lock); for (;;) { struct resource *conflict; conflict = __insert_resource(root, new); if (!conflict) break; if (conflict == root) break; /* Ok, expand resource to cover the conflict, then try again .. */ if (conflict->start < new->start) new->start = conflict->start; if (conflict->end > new->end) new->end = conflict->end; pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); } write_unlock(&resource_lock); } /* * Not for general consumption, only early boot memory map parsing, PCI * resource discovery, and late discovery of CXL resources are expected * to use this interface. The former are built-in and only the latter, * CXL, is a module. */ EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL); /** * remove_resource - Remove a resource in the resource tree * @old: resource to remove * * Returns 0 on success, -EINVAL if the resource is not valid. * * This function removes a resource previously inserted by insert_resource() * or insert_resource_conflict(), and moves the children (if any) up to * where they were before. insert_resource() and insert_resource_conflict() * insert a new resource, and move any conflicting resources down to the * children of the new resource. * * insert_resource(), insert_resource_conflict() and remove_resource() are * intended for producers of resources, such as FW modules and bus drivers. */ int remove_resource(struct resource *old) { int retval; write_lock(&resource_lock); retval = __release_resource(old, false); write_unlock(&resource_lock); return retval; } EXPORT_SYMBOL_GPL(remove_resource); static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; if (!parent) goto skip; if ((start < parent->start) || (end > parent->end)) goto out; if (res->sibling && (res->sibling->start <= end)) goto out; tmp = parent->child; if (tmp != res) { while (tmp->sibling != res) tmp = tmp->sibling; if (start <= tmp->end) goto out; } skip: for (tmp = res->child; tmp; tmp = tmp->sibling) if ((tmp->start < start) || (tmp->end > end)) goto out; res->start = start; res->end = end; result = 0; out: return result; } /** * adjust_resource - modify a resource's start and size * @res: resource to modify * @start: new start value * @size: new size * * Given an existing resource, change its start and size to match the * arguments. Returns 0 on success, -EBUSY if it can't fit. * Existing children of the resource are assumed to be immutable. */ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { int result; write_lock(&resource_lock); result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } EXPORT_SYMBOL(adjust_resource); static void __init __reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { struct resource *parent = root; struct resource *conflict; struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; int type = resource_type(root); if (!res) return; res->name = name; res->start = start; res->end = end; res->flags = type | IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; while (1) { conflict = __request_resource(parent, res); if (!conflict) { if (!next_res) break; res = next_res; next_res = NULL; continue; } /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { free_resource(res); WARN_ON(next_res); break; } /* failed, split and try again */ if (conflict->start > res->start) { end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { free_resource(res); break; } next_res->name = name; next_res->start = conflict->end + 1; next_res->end = end; next_res->flags = type | IORESOURCE_BUSY; next_res->desc = IORES_DESC_NONE; } } else { res->start = conflict->end + 1; } } } void __init reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { int abort = 0; write_lock(&resource_lock); if (root->start > start || root->end < end) { pr_err("requested range [0x%llx-0x%llx] not in root %pr\n", (unsigned long long)start, (unsigned long long)end, root); if (start > root->end || end < root->start) abort = 1; else { if (end > root->end) end = root->end; if (start < root->start) start = root->start; pr_err("fixing request to [0x%llx-0x%llx]\n", (unsigned long long)start, (unsigned long long)end); } dump_stack(); } if (!abort) __reserve_region_with_split(root, start, end, name); write_unlock(&resource_lock); } /** * resource_alignment - calculate resource's alignment * @res: resource pointer * * Returns alignment on success, 0 (invalid alignment) on failure. */ resource_size_t resource_alignment(struct resource *res) { switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { case IORESOURCE_SIZEALIGN: return resource_size(res); case IORESOURCE_STARTALIGN: return res->start; default: return 0; } } /* * This is compatibility stuff for IO resources. * * Note how this, unlike the above, knows about * the IO flag meanings (busy etc). * * request_region creates a new busy region. * * release_region releases a matching busy region. */ static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); static struct inode *iomem_inode; #ifdef CONFIG_IO_STRICT_DEVMEM static void revoke_iomem(struct resource *res) { /* pairs with smp_store_release() in iomem_init_inode() */ struct inode *inode = smp_load_acquire(&iomem_inode); /* * Check that the initialization has completed. Losing the race * is ok because it means drivers are claiming resources before * the fs_initcall level of init and prevent iomem_get_mapping users * from establishing mappings. */ if (!inode) return; /* * The expectation is that the driver has successfully marked * the resource busy by this point, so devmem_is_allowed() * should start returning false, however for performance this * does not iterate the entire resource range. */ if (devmem_is_allowed(PHYS_PFN(res->start)) && devmem_is_allowed(PHYS_PFN(res->end))) { /* * *cringe* iomem=relaxed says "go ahead, what's the * worst that can happen?" */ return; } unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1); } #else static void revoke_iomem(struct resource *res) {} #endif struct address_space *iomem_get_mapping(void) { /* * This function is only called from file open paths, hence guaranteed * that fs_initcalls have completed and no need to check for NULL. But * since revoke_iomem can be called before the initcall we still need * the barrier to appease checkers. */ return smp_load_acquire(&iomem_inode)->i_mapping; } static int __request_region_locked(struct resource *res, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); res->name = name; res->start = start; res->end = start + n - 1; for (;;) { struct resource *conflict; res->flags = resource_type(parent) | resource_ext_type(parent); res->flags |= IORESOURCE_BUSY | flags; res->desc = parent->desc; conflict = __request_resource(parent, res); if (!conflict) break; /* * mm/hmm.c reserves physical addresses which then * become unavailable to other users. Conflicts are * not expected. Warn to aid debugging if encountered. */ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { pr_warn("Unaddressable device %s %pR conflicts with %pR", conflict->name, conflict, res); } if (conflict != parent) { if (!(conflict->flags & IORESOURCE_BUSY)) { parent = conflict; continue; } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); write_unlock(&resource_lock); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); remove_wait_queue(&muxed_resource_wait, &wait); write_lock(&resource_lock); continue; } /* Uhhuh, that didn't work out.. */ return -EBUSY; } return 0; } /** * __request_region - create a new busy resource region * @parent: parent resource descriptor * @start: resource start address * @n: resource region size * @name: reserving caller's ID string * @flags: IO resource flags */ struct resource *__request_region(struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { struct resource *res = alloc_resource(GFP_KERNEL); int ret; if (!res) return NULL; write_lock(&resource_lock); ret = __request_region_locked(res, parent, start, n, name, flags); write_unlock(&resource_lock); if (ret) { free_resource(res); return NULL; } if (parent == &iomem_resource) revoke_iomem(res); return res; } EXPORT_SYMBOL(__request_region); /** * __release_region - release a previously reserved resource region * @parent: parent resource descriptor * @start: resource start address * @n: resource region size * * The described resource region must match a currently busy region. */ void __release_region(struct resource *parent, resource_size_t start, resource_size_t n) { struct resource **p; resource_size_t end; p = &parent->child; end = start + n - 1; write_lock(&resource_lock); for (;;) { struct resource *res = *p; if (!res) break; if (res->start <= start && res->end >= end) { if (!(res->flags & IORESOURCE_BUSY)) { p = &res->child; continue; } if (res->start != start || res->end != end) break; *p = res->sibling; write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); free_resource(res); return; } p = &res->sibling; } write_unlock(&resource_lock); pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end); } EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region * @start: resource start address * @size: resource region size * * This interface is intended for memory hot-delete. The requested region * is released from a currently busy memory resource. The requested region * must either match exactly or fit into a single busy resource entry. In * the latter case, the remaining resource is adjusted accordingly. * Existing children of the busy memory resource must be immutable in the * request. * * Note: * - Additional release conditions, such as overlapping region, can be * supported after they are confirmed as valid cases. * - When a busy memory resource gets split into two entries, the code * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { struct resource *parent = &iomem_resource; struct resource *new_res = NULL; bool alloc_nofail = false; struct resource **p; struct resource *res; resource_size_t end; end = start + size - 1; if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) return; /* * We free up quite a lot of memory on memory hotunplug (esp., memap), * just before releasing the region. This is highly unlikely to * fail - let's play save and make it never fail as the caller cannot * perform any error handling (e.g., trying to re-add memory will fail * similarly). */ retry: new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); while ((res = *p)) { if (res->start >= end) break; /* look for the next resource if it does not fit into */ if (res->start > start || res->end < end) { p = &res->sibling; continue; } if (!(res->flags & IORESOURCE_MEM)) break; if (!(res->flags & IORESOURCE_BUSY)) { p = &res->child; continue; } /* found the target resource; let's adjust accordingly */ if (res->start == start && res->end == end) { /* free the whole entry */ *p = res->sibling; free_resource(res); } else if (res->start == start && res->end != end) { /* adjust the start */ WARN_ON_ONCE(__adjust_resource(res, end + 1, res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ WARN_ON_ONCE(__adjust_resource(res, res->start, start - res->start)); } else { /* split into two entries - we need a new resource */ if (!new_res) { new_res = alloc_resource(GFP_ATOMIC); if (!new_res) { alloc_nofail = true; write_unlock(&resource_lock); goto retry; } } new_res->name = res->name; new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; new_res->desc = res->desc; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; if (WARN_ON_ONCE(__adjust_resource(res, res->start, start - res->start))) break; res->sibling = new_res; new_res = NULL; } break; } write_unlock(&resource_lock); free_resource(new_res); } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG static bool system_ram_resources_mergeable(struct resource *r1, struct resource *r2) { /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ return r1->flags == r2->flags && r1->end + 1 == r2->start && r1->name == r2->name && r1->desc == r2->desc && !r1->child && !r2->child; } /** * merge_system_ram_resource - mark the System RAM resource mergeable and try to * merge it with adjacent, mergeable resources * @res: resource descriptor * * This interface is intended for memory hotplug, whereby lots of contiguous * system ram resources are added (e.g., via add_memory*()) by a driver, and * the actual resource boundaries are not of interest (e.g., it might be * relevant for DIMMs). Only resources that are marked mergeable, that have the * same parent, and that don't have any children are considered. All mergeable * resources must be immutable during the request. * * Note: * - The caller has to make sure that no pointers to resources that are * marked mergeable are used anymore after this call - the resource might * be freed and the pointer might be stale! * - release_mem_region_adjustable() will split on demand on memory hotunplug */ void merge_system_ram_resource(struct resource *res) { const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; struct resource *cur; if (WARN_ON_ONCE((res->flags & flags) != flags)) return; write_lock(&resource_lock); res->flags |= IORESOURCE_SYSRAM_MERGEABLE; /* Try to merge with next item in the list. */ cur = res->sibling; if (cur && system_ram_resources_mergeable(res, cur)) { res->end = cur->end; res->sibling = cur->sibling; free_resource(cur); } /* Try to merge with previous item in the list. */ cur = res->parent->child; while (cur && cur->sibling != res) cur = cur->sibling; if (cur && system_ram_resources_mergeable(cur, res)) { cur->end = res->end; cur->sibling = res->sibling; free_resource(res); } write_unlock(&resource_lock); } #endif /* CONFIG_MEMORY_HOTPLUG */ /* * Managed region resource */ static void devm_resource_release(struct device *dev, void *ptr) { struct resource **r = ptr; release_resource(*r); } /** * devm_request_resource() - request and reserve an I/O or memory resource * @dev: device for which to request the resource * @root: root of the resource tree from which to request the resource * @new: descriptor of the resource to request * * This is a device-managed version of request_resource(). There is usually * no need to release resources requested by this function explicitly since * that will be taken care of when the device is unbound from its driver. * If for some reason the resource needs to be released explicitly, because * of ordering issues for example, drivers must call devm_release_resource() * rather than the regular release_resource(). * * When a conflict is detected between any existing resources and the newly * requested resource, an error message will be printed. * * Returns 0 on success or a negative error code on failure. */ int devm_request_resource(struct device *dev, struct resource *root, struct resource *new) { struct resource *conflict, **ptr; ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; *ptr = new; conflict = request_resource_conflict(root, new); if (conflict) { dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", new, conflict->name, conflict); devres_free(ptr); return -EBUSY; } devres_add(dev, ptr); return 0; } EXPORT_SYMBOL(devm_request_resource); static int devm_resource_match(struct device *dev, void *res, void *data) { struct resource **ptr = res; return *ptr == data; } /** * devm_release_resource() - release a previously requested resource * @dev: device for which to release the resource * @new: descriptor of the resource to release * * Releases a resource previously requested using devm_request_resource(). */ void devm_release_resource(struct device *dev, struct resource *new) { WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, new)); } EXPORT_SYMBOL(devm_release_resource); struct region_devres { struct resource *parent; resource_size_t start; resource_size_t n; }; static void devm_region_release(struct device *dev, void *res) { struct region_devres *this = res; __release_region(this->parent, this->start, this->n); } static int devm_region_match(struct device *dev, void *res, void *match_data) { struct region_devres *this = res, *match = match_data; return this->parent == match->parent && this->start == match->start && this->n == match->n; } struct resource * __devm_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name) { struct region_devres *dr = NULL; struct resource *res; dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) return NULL; dr->parent = parent; dr->start = start; dr->n = n; res = __request_region(parent, start, n, name, 0); if (res) devres_add(dev, dr); else devres_free(dr); return res; } EXPORT_SYMBOL(__devm_request_region); void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n) { struct region_devres match_data = { parent, start, n }; __release_region(parent, start, n); WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, &match_data)); } EXPORT_SYMBOL(__devm_release_region); /* * Reserve I/O ports or memory based on "reserve=" kernel parameter. */ #define MAXRESERVE 4 static int __init reserve_setup(char *str) { static int reserved; static struct resource reserve[MAXRESERVE]; for (;;) { unsigned int io_start, io_num; int x = reserved; struct resource *parent; if (get_option(&str, &io_start) != 2) break; if (get_option(&str, &io_num) == 0) break; if (x < MAXRESERVE) { struct resource *res = reserve + x; /* * If the region starts below 0x10000, we assume it's * I/O port space; otherwise assume it's memory. */ if (io_start < 0x10000) { res->flags = IORESOURCE_IO; parent = &ioport_resource; } else { res->flags = IORESOURCE_MEM; parent = &iomem_resource; } res->name = "reserved"; res->start = io_start; res->end = io_start + io_num - 1; res->flags |= IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; res->child = NULL; if (request_resource(parent, res) == 0) reserved = x+1; } } return 1; } __setup("reserve=", reserve_setup); /* * Check if the requested addr and size spans more than any slot in the * iomem resource tree. */ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) { resource_size_t end = addr + size - 1; struct resource *p; int err = 0; read_lock(&resource_lock); for_each_resource(&iomem_resource, p, false) { /* * We can probably skip the resources without * IORESOURCE_IO attribute? */ if (p->start > end) continue; if (p->end < addr) continue; if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && PFN_DOWN(p->end) >= PFN_DOWN(end)) continue; /* * if a resource is "BUSY", it's not a hardware resource * but a driver mapping of such a resource; we don't want * to warn for those; some drivers legitimately map only * partial hardware resources. (example: vesafb) */ if (p->flags & IORESOURCE_BUSY) continue; pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n", &addr, &end, p->name, p); err = -1; break; } read_unlock(&resource_lock); return err; } #ifdef CONFIG_STRICT_DEVMEM static int strict_iomem_checks = 1; #else static int strict_iomem_checks; #endif /* * Check if an address is exclusive to the kernel and must not be mapped to * user space, for example, via /dev/mem. * * Returns true if exclusive to the kernel, otherwise returns false. */ bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size) { const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM | IORESOURCE_EXCLUSIVE; bool skip_children = false, err = false; struct resource *p; read_lock(&resource_lock); for_each_resource(root, p, skip_children) { if (p->start >= addr + size) break; if (p->end < addr) { skip_children = true; continue; } skip_children = false; /* * IORESOURCE_SYSTEM_RAM resources are exclusive if * IORESOURCE_EXCLUSIVE is set, even if they * are not busy and even if "iomem=relaxed" is set. The * responsible driver dynamically adds/removes system RAM within * such an area and uncontrolled access is dangerous. */ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) { err = true; break; } /* * A resource is exclusive if IORESOURCE_EXCLUSIVE is set * or CONFIG_IO_STRICT_DEVMEM is enabled and the * resource is busy. */ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY)) continue; if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) || p->flags & IORESOURCE_EXCLUSIVE) { err = true; break; } } read_unlock(&resource_lock); return err; } bool iomem_is_exclusive(u64 addr) { return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK, PAGE_SIZE); } struct resource_entry *resource_list_create_entry(struct resource *res, size_t extra_size) { struct resource_entry *entry; entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); if (entry) { INIT_LIST_HEAD(&entry->node); entry->res = res ? res : &entry->__res; } return entry; } EXPORT_SYMBOL(resource_list_create_entry); void resource_list_free(struct list_head *head) { struct resource_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, node) resource_list_destroy_entry(entry); } EXPORT_SYMBOL(resource_list_free); #ifdef CONFIG_GET_FREE_REGION #define GFR_DESCENDING (1UL << 0) #define GFR_REQUEST_REGION (1UL << 1) #ifdef PA_SECTION_SHIFT #define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) #else #define GFR_DEFAULT_ALIGN PAGE_SIZE #endif static resource_size_t gfr_start(struct resource *base, resource_size_t size, resource_size_t align, unsigned long flags) { if (flags & GFR_DESCENDING) { resource_size_t end; end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); return end - size + 1; } return ALIGN(max(base->start, align), align); } static bool gfr_continue(struct resource *base, resource_size_t addr, resource_size_t size, unsigned long flags) { if (flags & GFR_DESCENDING) return addr > size && addr >= base->start; /* * In the ascend case be careful that the last increment by * @size did not wrap 0. */ return addr > addr - size && addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, unsigned long flags) { if (flags & GFR_DESCENDING) return addr - size; return addr + size; } static void remove_free_mem_region(void *_res) { struct resource *res = _res; if (res->parent) remove_resource(res); free_resource(res); } static struct resource * get_free_mem_region(struct device *dev, struct resource *base, resource_size_t size, const unsigned long align, const char *name, const unsigned long desc, const unsigned long flags) { resource_size_t addr; struct resource *res; struct region_devres *dr = NULL; size = ALIGN(size, align); res = alloc_resource(GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); if (dev && (flags & GFR_REQUEST_REGION)) { dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) { free_resource(res); return ERR_PTR(-ENOMEM); } } else if (dev) { if (devm_add_action_or_reset(dev, remove_free_mem_region, res)) return ERR_PTR(-ENOMEM); } write_lock(&resource_lock); for (addr = gfr_start(base, size, align, flags); gfr_continue(base, addr, align, flags); addr = gfr_next(addr, align, flags)) { if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; if (flags & GFR_REQUEST_REGION) { if (__request_region_locked(res, &iomem_resource, addr, size, name, 0)) break; if (dev) { dr->parent = &iomem_resource; dr->start = addr; dr->n = size; devres_add(dev, dr); } res->desc = desc; write_unlock(&resource_lock); /* * A driver is claiming this region so revoke any * mappings. */ revoke_iomem(res); } else { res->start = addr; res->end = addr + size - 1; res->name = name; res->desc = desc; res->flags = IORESOURCE_MEM; /* * Only succeed if the resource hosts an exclusive * range after the insert */ if (__insert_resource(base, res) || res->child) break; write_unlock(&resource_lock); } return res; } write_unlock(&resource_lock); if (flags & GFR_REQUEST_REGION) { free_resource(res); devres_free(dr); } else if (dev) devm_release_action(dev, remove_free_mem_region, res); return ERR_PTR(-ERANGE); } /** * devm_request_free_mem_region - find free region for device private memory * * @dev: device struct to bind the resource to * @size: size in bytes of the device memory to add * @base: resource tree to look in * * This function tries to find an empty range of physical address big enough to * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE * memory, which in turn allocates struct pages. */ struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size) { unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN, dev_name(dev), IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(devm_request_free_mem_region); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name) { unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name, IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(request_free_mem_region); /** * alloc_free_mem_region - find a free region relative to @base * @base: resource that will parent the new resource * @size: size in bytes of memory to allocate from @base * @align: alignment requirements for the allocation * @name: resource name * * Buses like CXL, that can dynamically instantiate new memory regions, * need a method to allocate physical address space for those regions. * Allocate and insert a new resource to cover a free, unclaimed by a * descendant of @base, range in the span of @base. */ struct resource *alloc_free_mem_region(struct resource *base, unsigned long size, unsigned long align, const char *name) { /* Default of ascending direction and insert resource */ unsigned long flags = 0; return get_free_mem_region(NULL, base, size, align, name, IORES_DESC_NONE, flags); } EXPORT_SYMBOL_GPL(alloc_free_mem_region); #endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) strict_iomem_checks = 0; if (strstr(str, "strict")) strict_iomem_checks = 1; return 1; } static int iomem_fs_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type iomem_fs_type = { .name = "iomem", .owner = THIS_MODULE, .init_fs_context = iomem_fs_init_fs_context, .kill_sb = kill_anon_super, }; static int __init iomem_init_inode(void) { static struct vfsmount *iomem_vfs_mount; static int iomem_fs_cnt; struct inode *inode; int rc; rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt); if (rc < 0) { pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc); return rc; } inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb); if (IS_ERR(inode)) { rc = PTR_ERR(inode); pr_err("Cannot allocate inode for iomem: %d\n", rc); simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt); return rc; } /* * Publish iomem revocation inode initialized. * Pairs with smp_load_acquire() in revoke_iomem(). */ smp_store_release(&iomem_inode, inode); return 0; } fs_initcall(iomem_init_inode); __setup("iomem=", strict_iomem);
697 5 404 404 400 404 404 296 30 41 337 337 336 48 4 12 73 16 275 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ #define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; union __packed { __u8 flags; struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, routeraddr : 1, preferpd : 1, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 4, preferpd : 1, routeraddr : 1, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif }; }; __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; /* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ static_assert(sizeof(struct prefix_info) == 32); #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; struct netlink_ext_ack *extack; }; struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; u8 ifa_proto; const struct in6_addr *peer_pfx; u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; u16 scope; }; int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, unsigned char nsegs); bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } #define INFINITY_LIFE_TIME 0xFFFFFFFF static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == INFINITY_LIFE_TIME) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ static inline bool ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) return false; return pskb_may_pull(skb, len); } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); int ipv6_anycast_init(void); void ipv6_anycast_cleanup(void); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, const struct sk_buff *skb) { if (netif_is_l3_master(dev)) dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); return __in6_dev_get(dev); } /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device * * This is a safer version of __in6_dev_get */ static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) { if (likely(dev)) return rcu_dereference_rtnl(dev->ip6_ptr); else return NULL; } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ? idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } /* called with rcu_read_lock held */ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); if (unlikely(!idev)) return true; return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) { return refcount_inc_not_zero(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif #endif
823 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 #ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H #define _LINUX_UNALIGNED_PACKED_STRUCT_H #include <linux/types.h> struct __una_u16 { u16 x; } __packed; struct __una_u32 { u32 x; } __packed; struct __una_u64 { u64 x; } __packed; static inline u16 __get_unaligned_cpu16(const void *p) { const struct __una_u16 *ptr = (const struct __una_u16 *)p; return ptr->x; } static inline u32 __get_unaligned_cpu32(const void *p) { const struct __una_u32 *ptr = (const struct __una_u32 *)p; return ptr->x; } static inline u64 __get_unaligned_cpu64(const void *p) { const struct __una_u64 *ptr = (const struct __una_u64 *)p; return ptr->x; } static inline void __put_unaligned_cpu16(u16 val, void *p) { struct __una_u16 *ptr = (struct __una_u16 *)p; ptr->x = val; } static inline void __put_unaligned_cpu32(u32 val, void *p) { struct __una_u32 *ptr = (struct __una_u32 *)p; ptr->x = val; } static inline void __put_unaligned_cpu64(u64 val, void *p) { struct __una_u64 *ptr = (struct __una_u64 *)p; ptr->x = val; } #endif /* _LINUX_UNALIGNED_PACKED_STRUCT_H */
3 4 3 4 3 3 1 4 4 3 4 2 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ #ifndef _EXT4_EXTENTS #define _EXT4_EXTENTS #include "ext4.h" /* * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks * becomes very small, so index split, in-depth growing and * other hard changes happen much more often. * This is for debug purposes only. */ #define AGGRESSIVE_TEST_ /* * With EXTENTS_STATS defined, the number of blocks and extents * are collected in the truncate path. They'll be shown at * umount time. */ #define EXTENTS_STATS__ /* * If CHECK_BINSEARCH is defined, then the results of the binary search * will also be checked by linear search. */ #define CHECK_BINSEARCH__ /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. */ #define EXT_STATS_ /* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. * For non-inode extent blocks, ext4_extent_tail * follows the array. */ /* * This is the extent tail on-disk structure. * All other extent structures are 12 bytes long. It turns out that * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which * covers all valid ext4 block sizes. Therefore, this tail structure can be * crammed into the end of the block without having to rebalance the tree. */ struct ext4_extent_tail { __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ }; /* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ struct ext4_extent { __le32 ee_block; /* first logical block extent covers */ __le16 ee_len; /* number of blocks covered by extent */ __le16 ee_start_hi; /* high 16 bits of physical block */ __le32 ee_start_lo; /* low 32 bits of physical block */ }; /* * This is index on-disk structure. * It's used at all the levels except the bottom. */ struct ext4_extent_idx { __le32 ei_block; /* index covers logical blocks from 'block' */ __le32 ei_leaf_lo; /* pointer to the physical block of the next * * level. leaf or next index could be there */ __le16 ei_leaf_hi; /* high 16 bits of physical block */ __u16 ei_unused; }; /* * Each block (leaves and indexes), even inode-stored has header. */ struct ext4_extent_header { __le16 eh_magic; /* probably will support different formats */ __le16 eh_entries; /* number of valid entries */ __le16 eh_max; /* capacity of store in entries */ __le16 eh_depth; /* has tree real underlying blocks? */ __le32 eh_generation; /* generation of the tree */ }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) #define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) static inline struct ext4_extent_tail * find_ext4_extent_tail(struct ext4_extent_header *eh) { return (struct ext4_extent_tail *)(((void *)eh) + EXT4_EXTENT_TAIL_OFFSET(eh)); } /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. * Truncate uses it to simulate recursive walking. */ struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; struct buffer_head *p_bh; }; /* * Used to record a portion of a cluster found at the beginning or end * of an extent while traversing the extent tree during space removal. * A partial cluster may be removed if it does not contain blocks shared * with extents that aren't being deleted (tofree state). Otherwise, * it cannot be removed (nofree state). */ struct partial_cluster { ext4_fsblk_t pclu; /* physical cluster number */ ext4_lblk_t lblk; /* logical block number within logical cluster */ enum {initial, tofree, nofree} state; }; /* * structure for external API */ /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this * particular extent is an initialized extent or an unwritten (i.e. * preallocated). * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an * unwritten one. In other words, if MSB of ee_len is set, it is an * unwritten extent with only one special scenario when ee_len = 0x8000. * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) #define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ ((struct ext4_extent *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_FIRST_INDEX(__hdr__) \ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_HAS_FREE_INDEX(__path__) \ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ < le16_to_cpu((__path__)->p_hdr->eh_max)) #define EXT_LAST_EXTENT(__hdr__) \ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { return (struct ext4_extent_header *) EXT4_I(inode)->i_data; } static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) { return (struct ext4_extent_header *) bh->b_data; } static inline unsigned short ext_depth(struct inode *inode) { return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); } static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) { return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? le16_to_cpu(ext->ee_len) : (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) { ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } /* * ext4_ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; block = le32_to_cpu(ex->ee_start_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; return block; } /* * ext4_idx_pblock: * combine low and high parts of a leaf physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) { ext4_fsblk_t block; block = le32_to_cpu(ix->ei_leaf_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; return block; } /* * ext4_ext_store_pblock: * stores a large physical block number into an extent struct, * breaking it into parts */ static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) { ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } /* * ext4_idx_store_pblock: * stores a large physical block number into an index struct, * breaking it into parts */ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) { ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } #endif /* _EXT4_EXTENTS */
61 61 61 61 61 61 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 // SPDX-License-Identifier: GPL-2.0-or-later /* rxrpc network namespace handling. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include "ar-internal.h" unsigned int rxrpc_net_id; static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = container_of(timer, struct rxrpc_net, service_conn_reap_timer); if (rxnet->live) rxrpc_queue_work(&rxnet->service_conn_reaper); } static void rxrpc_peer_keepalive_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = container_of(timer, struct rxrpc_net, peer_keepalive_timer); if (rxnet->live) rxrpc_queue_work(&rxnet->peer_keepalive_work); } /* * Initialise a per-network namespace record. */ static __net_init int rxrpc_init_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); int ret, i; rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); rxnet->epoch |= RXRPC_RANDOM_EPOCH; INIT_LIST_HEAD(&rxnet->calls); spin_lock_init(&rxnet->call_lock); atomic_set(&rxnet->nr_calls, 1); atomic_set(&rxnet->nr_conns, 1); INIT_LIST_HEAD(&rxnet->bundle_proc_list); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); INIT_WORK(&rxnet->service_conn_reaper, rxrpc_service_connection_reaper); timer_setup(&rxnet->service_conn_reap_timer, rxrpc_service_conn_reap_timeout, 0); atomic_set(&rxnet->nr_client_conns, 0); INIT_HLIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); hash_init(rxnet->peer_hash); spin_lock_init(&rxnet->peer_hash_lock); for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++) INIT_LIST_HEAD(&rxnet->peer_keepalive[i]); INIT_LIST_HEAD(&rxnet->peer_keepalive_new); timer_setup(&rxnet->peer_keepalive_timer, rxrpc_peer_keepalive_timeout, 0); INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker); rxnet->peer_keepalive_base = ktime_get_seconds(); ret = -ENOMEM; rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net); if (!rxnet->proc_net) goto err_proc; proc_create_net("calls", 0444, rxnet->proc_net, &rxrpc_call_seq_ops, sizeof(struct seq_net_private)); proc_create_net("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_ops, sizeof(struct seq_net_private)); proc_create_net("bundles", 0444, rxnet->proc_net, &rxrpc_bundle_seq_ops, sizeof(struct seq_net_private)); proc_create_net("peers", 0444, rxnet->proc_net, &rxrpc_peer_seq_ops, sizeof(struct seq_net_private)); proc_create_net("locals", 0444, rxnet->proc_net, &rxrpc_local_seq_ops, sizeof(struct seq_net_private)); proc_create_net_single_write("stats", S_IFREG | 0644, rxnet->proc_net, rxrpc_stats_show, rxrpc_stats_clear, NULL); return 0; err_proc: rxnet->live = false; return ret; } /* * Clean up a per-network namespace record. */ static __net_exit void rxrpc_exit_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; del_timer_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); /* Remove the timer again as the worker may have restarted it. */ del_timer_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_peers(rxnet); rxrpc_destroy_all_locals(rxnet); proc_remove(rxnet->proc_net); } struct pernet_operations rxrpc_net_ops = { .init = rxrpc_init_net, .exit = rxrpc_exit_net, .id = &rxrpc_net_id, .size = sizeof(struct rxrpc_net), };
1 1 2 18 18 11 8 6 6 5 5 13 2 18 18 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/in.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/if_arp.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> #include <net/addrconf.h> #include "rds_single_path.h" #include "rds.h" #include "ib.h" #include "ib_mr.h" static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE; static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; static atomic_t rds_ib_unloading; module_param(rds_ib_mr_1m_pool_size, int, 0444); MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA"); module_param(rds_ib_mr_8k_pool_size, int, 0444); MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); /* * we have a clumsy combination of RCU and a rwsem protecting this list * because it is used both in the get_mr fast path and while blocking in * the FMR flushing path. */ DECLARE_RWSEM(rds_ib_devices_lock); struct list_head rds_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); static void rds_ib_nodev_connect(void) { struct rds_ib_connection *ic; spin_lock(&ib_nodev_conns_lock); list_for_each_entry(ic, &ib_nodev_conns, ib_node) rds_conn_connect_if_down(ic->conn); spin_unlock(&ib_nodev_conns_lock); } static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) { struct rds_ib_connection *ic; unsigned long flags; spin_lock_irqsave(&rds_ibdev->spinlock, flags); list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) rds_conn_path_drop(&ic->conn->c_path[0], true); spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); } /* * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references * from interrupt context so we push freing off into a work struct in krdsd. */ static void rds_ib_dev_free(struct work_struct *work) { struct rds_ib_ipaddr *i_ipaddr, *i_next; struct rds_ib_device *rds_ibdev = container_of(work, struct rds_ib_device, free_work); if (rds_ibdev->mr_8k_pool) rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); if (rds_ibdev->mr_1m_pool) rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); kfree(i_ipaddr); } kfree(rds_ibdev->vector_load); kfree(rds_ibdev); } void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) { BUG_ON(refcount_read(&rds_ibdev->refcount) == 0); if (refcount_dec_and_test(&rds_ibdev->refcount)) queue_work(rds_wq, &rds_ibdev->free_work); } static int rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; int ret; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return -EOPNOTSUPP; /* Device must support FRWR */ if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) return -EOPNOTSUPP; rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) return -ENOMEM; spin_lock_init(&rds_ibdev->spinlock); refcount_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); rds_ibdev->odp_capable = !!(device->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING) && !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_WRITE) && !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_READ); rds_ibdev->max_1m_mrs = device->attrs.max_mr ? min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; rds_ibdev->max_8k_mrs = device->attrs.max_mr ? min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size; rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->vector_load = kcalloc(device->num_comp_vectors, sizeof(int), GFP_KERNEL); if (!rds_ibdev->vector_load) { pr_err("RDS/IB: %s failed to allocate vector memory\n", __func__); ret = -ENOMEM; goto put_dev; } rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { ret = PTR_ERR(rds_ibdev->pd); rds_ibdev->pd = NULL; goto put_dev; } rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); if (IS_ERR(rds_ibdev->mr_1m_pool)) { ret = PTR_ERR(rds_ibdev->mr_1m_pool); rds_ibdev->mr_1m_pool = NULL; goto put_dev; } rds_ibdev->mr_8k_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); if (IS_ERR(rds_ibdev->mr_8k_pool)) { ret = PTR_ERR(rds_ibdev->mr_8k_pool); rds_ibdev->mr_8k_pool = NULL; goto put_dev; } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); pr_info("RDS/IB: %s: added\n", device->name); down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); up_write(&rds_ib_devices_lock); refcount_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); rds_ib_nodev_connect(); return 0; put_dev: rds_ib_dev_put(rds_ibdev); return ret; } /* * New connections use this to find the device to associate with the * connection. It's not in the fast path so we're not concerned about the * performance of the IB call. (As of this writing, it uses an interrupt * blocking spinlock to serialize walking a per-device list of all registered * clients.) * * RCU is used to handle incoming connections racing with device teardown. * Rather than use a lock to serialize removal from the client_data and * getting a new reference, we use an RCU grace period. The destruction * path removes the device from client_data and then waits for all RCU * readers to finish. * * A new connection can get NULL from this if its arriving on a * device that is in the process of being removed. */ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) { struct rds_ib_device *rds_ibdev; rcu_read_lock(); rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (rds_ibdev) refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } /* * The IB stack is letting us know that a device is going away. This can * happen if the underlying HCA driver is removed or if PCI hotplug is removing * the pci function, for example. * * This can be called at any time and can be racing with any other RDS path. */ static void rds_ib_remove_one(struct ib_device *device, void *client_data) { struct rds_ib_device *rds_ibdev = client_data; rds_ib_dev_shutdown(rds_ibdev); /* stop connection attempts from getting a reference to this device. */ ib_set_client_data(device, &rds_ib_client, NULL); down_write(&rds_ib_devices_lock); list_del_rcu(&rds_ibdev->list); up_write(&rds_ib_devices_lock); /* * This synchronize rcu is waiting for readers of both the ib * client data and the devices list to finish before we drop * both of those references. */ synchronize_rcu(); rds_ib_dev_put(rds_ibdev); rds_ib_dev_put(rds_ibdev); } struct ib_client rds_ib_client = { .name = "rds_ib", .add = rds_ib_add_one, .remove = rds_ib_remove_one }; static int rds_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds_info_rdma_connection *iinfo = buffer; struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; if (conn->c_isv6) return 0; iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; if (ic) { iinfo->tos = conn->c_tos; iinfo->sl = ic->i_sl; } memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, (union ib_gid *)&iinfo->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } #if IS_ENABLED(CONFIG_IPV6) /* IPv6 version of rds_ib_conn_info_visitor(). */ static int rds6_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { struct rds6_info_rdma_connection *iinfo6 = buffer; struct rds_ib_connection *ic = conn->c_transport_data; /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; iinfo6->src_addr = conn->c_laddr; iinfo6->dst_addr = conn->c_faddr; if (ic) { iinfo6->tos = conn->c_tos; iinfo6->sl = ic->i_sl; } memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid, (union ib_gid *)&iinfo6->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo6->max_send_wr = ic->i_send_ring.w_nr; iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; iinfo6->max_send_sge = rds_ibdev->max_sge; rds6_ib_get_mr_info(rds_ibdev, iinfo6); iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } #endif static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8]; rds_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor, buffer, sizeof(struct rds_info_rdma_connection)); } #if IS_ENABLED(CONFIG_IPV6) /* IPv6 version of rds_ib_ic_info(). */ static void rds6_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8]; rds_for_each_conn_info(sock, len, iter, lens, rds6_ib_conn_info_visitor, buffer, sizeof(struct rds6_info_rdma_connection)); } #endif /* * Early RDS/IB was built to only bind to an address if there is an IPoIB * device with that address set. * * If it were me, I'd advocate for something more flexible. Sending and * receiving should be device-agnostic. Transports would try and maintain * connections between peers who have messages queued. Userspace would be * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". */ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, __u32 scope_id) { int ret; struct rdma_cm_id *cm_id; #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 sin6; #endif struct sockaddr_in sin; struct sockaddr *sa; bool isv4; isv4 = ipv6_addr_v4mapped(addr); /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); if (isv4) { memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = addr->s6_addr32[3]; sa = (struct sockaddr *)&sin; } else { #if IS_ENABLED(CONFIG_IPV6) memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr; sin6.sin6_scope_id = scope_id; sa = (struct sockaddr *)&sin6; /* XXX Do a special IPv6 link local address check here. The * reason is that rdma_bind_addr() always succeeds with IPv6 * link local address regardless it is indeed configured in a * system. */ if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { struct net_device *dev; if (scope_id == 0) { ret = -EADDRNOTAVAIL; goto out; } /* Use init_net for now as RDS is not network * name space aware. */ dev = dev_get_by_index(&init_net, scope_id); if (!dev) { ret = -EADDRNOTAVAIL; goto out; } if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { dev_put(dev); ret = -EADDRNOTAVAIL; goto out; } dev_put(dev); } #else ret = -EADDRNOTAVAIL; goto out; #endif } /* rdma_bind_addr will only succeed for IB & iWARP devices */ ret = rdma_bind_addr(cm_id, sa); /* due to this, we will claim to support iWARP devices unless we check node_type. */ if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI6c%%%u ret %d node type %d\n", addr, scope_id, ret, cm_id->device ? cm_id->device->node_type : -1); out: rdma_destroy_id(cm_id); return ret; } static void rds_ib_unregister_client(void) { ib_unregister_client(&rds_ib_client); /* wait for rds_ib_dev_free() to complete */ flush_workqueue(rds_wq); } static void rds_ib_set_unloading(void) { atomic_set(&rds_ib_unloading, 1); } static bool rds_ib_is_unloading(struct rds_connection *conn) { struct rds_conn_path *cp = &conn->c_path[0]; return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) || atomic_read(&rds_ib_unloading) != 0); } void rds_ib_exit(void) { rds_ib_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); #endif rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); rds_ib_mr_exit(); } static u8 rds_ib_get_tos_map(u8 tos) { /* 1:1 user to transport map for RDMA transport. * In future, if custom map is desired, hook can export * user configurable map. */ return tos; } struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, .xmit_path_complete = rds_ib_xmit_path_complete, .xmit = rds_ib_xmit, .xmit_rdma = rds_ib_xmit_rdma, .xmit_atomic = rds_ib_xmit_atomic, .recv_path = rds_ib_recv_path, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, .conn_path_connect = rds_ib_conn_path_connect, .conn_path_shutdown = rds_ib_conn_path_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_handle_connect = rds_ib_cm_handle_connect, .cm_connect_complete = rds_ib_cm_connect_complete, .stats_info_copy = rds_ib_stats_info_copy, .exit = rds_ib_exit, .get_mr = rds_ib_get_mr, .sync_mr = rds_ib_sync_mr, .free_mr = rds_ib_free_mr, .flush_mrs = rds_ib_flush_mrs, .get_tos_map = rds_ib_get_tos_map, .t_owner = THIS_MODULE, .t_name = "infiniband", .t_unloading = rds_ib_is_unloading, .t_type = RDS_TRANS_IB }; int rds_ib_init(void) { int ret; INIT_LIST_HEAD(&rds_ib_devices); ret = rds_ib_mr_init(); if (ret) goto out; ret = ib_register_client(&rds_ib_client); if (ret) goto out_mr_exit; ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; ret = rds_ib_recv_init(); if (ret) goto out_sysctl; rds_trans_register(&rds_ib_transport); rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); #endif goto out; out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); out_mr_exit: rds_ib_mr_exit(); out: return ret; } MODULE_LICENSE("GPL");
92 207 5 12 13 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM io_uring #if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IO_URING_H #include <linux/tracepoint.h> #include <uapi/linux/io_uring.h> #include <linux/io_uring_types.h> #include <linux/io_uring.h> struct io_wq_work; /** * io_uring_create - called after a new io_uring context was prepared * * @fd: corresponding file descriptor * @ctx: pointer to a ring context structure * @sq_entries: actual SQ size * @cq_entries: actual CQ size * @flags: SQ ring flags, provided to io_uring_setup(2) * * Allows to trace io_uring creation and provide pointer to a context, that can * be used later to find correlated events. */ TRACE_EVENT(io_uring_create, TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( __field( int, fd ) __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; __entry->flags = flags; ), TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); /** * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers * @ret: return code * * Allows to trace fixed files/buffers, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. */ TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, unsigned nr_bufs, long ret), TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files) __field( unsigned, nr_bufs ) __field( long, ret ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " "ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, __entry->nr_bufs, __entry->ret) ); /** * io_uring_file_get - called before getting references to an SQE file * * @req: pointer to a submitted request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can * help figuring out if it makes sense to use fixed files, or check that fixed * files are used correctly. */ TRACE_EVENT(io_uring_file_get, TP_PROTO(struct io_kiocb *req, int fd), TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, fd ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), TP_printk("ring %p, req %p, user_data 0x%llx, fd %d", __entry->ctx, __entry->req, __entry->user_data, __entry->fd) ); /** * io_uring_queue_async_work - called before submitting a new async work * * @req: pointer to a submitted request * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(struct io_kiocb *req, int rw), TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->flags = (__force unsigned long long) req->flags; __entry->opcode = req->opcode; __entry->work = &req->work; __entry->rw = rw; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** * io_uring_defer - called when an io_uring request is deferred * * @req: pointer to a deferred request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, data ) __field( u8, opcode ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->data = req->cqe.user_data; __entry->opcode = req->opcode; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", __entry->ctx, __entry->req, __entry->data, __get_str(op_str)) ); /** * io_uring_link - called before the io_uring request added into link_list of * another request * * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * * Allows to track linked requests, to understand dependencies between requests * and how does it influence their execution flow. */ TRACE_EVENT(io_uring_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( void *, target_req ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), TP_printk("ring %p, request %p linked after %p", __entry->ctx, __entry->req, __entry->target_req) ); /** * io_uring_cqring_wait - called before start waiting for an available CQE * * @ctx: pointer to a ring context structure * @min_events: minimal number of events to wait for * * Allows to track waiting for CQE, so that we can e.g. troubleshoot * situations, when an application wants to wait for an event, that never * comes. */ TRACE_EVENT(io_uring_cqring_wait, TP_PROTO(void *ctx, int min_events), TP_ARGS(ctx, min_events), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, min_events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->min_events = min_events; ), TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) ); /** * io_uring_fail_link - called before failing a linked request * * @req: request, which links were cancelled * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work * was cancelled, but also which request was the reason. */ TRACE_EVENT(io_uring_fail_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( void *, link ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->link = link; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure * @req: (optional) pointer to a submitted request * @cqe: pointer to the filled in CQE being posted */ TRACE_EVENT(io_uring_complete, TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe), TP_ARGS(ctx, req, cqe), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) __field( u64, extra1 ) __field( u64, extra2 ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->user_data = cqe->user_data; __entry->res = cqe->res; __entry->cflags = cqe->flags; __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " "extra1 %llu extra2 %llu ", __entry->ctx, __entry->req, __entry->user_data, __entry->res, __entry->cflags, (unsigned long long) __entry->extra1, (unsigned long long) __entry->extra2) ); /** * io_uring_submit_req - called before submitting a request * * @req: pointer to a submitted request * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_req, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->flags = (__force unsigned long long) req->flags; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, " "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->sq_thread) ); /* * io_uring_poll_arm - called after arming a poll wait if successful * * @req: pointer to the armed request * @mask: request poll events mask * @events: registered events of interest * * Allows to track which fds are waiting for and what are the events of * interest. */ TRACE_EVENT(io_uring_poll_arm, TP_PROTO(struct io_kiocb *req, int mask, int events), TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __field( int, events ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask, __entry->events) ); /* * io_uring_task_add - called after adding a task * * @req: pointer to request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, TP_PROTO(struct io_kiocb *req, int mask), TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask) ); /* * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed * @req: pointer to request * @error: error it failed with * * Allows easier diagnosing of malformed requests in production systems. */ TRACE_EVENT(io_uring_req_failed, TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( u8, flags ) __field( u8, ioprio ) __field( u64, off ) __field( u64, addr ) __field( u32, len ) __field( u32, op_flags ) __field( u16, buf_index ) __field( u16, personality ) __field( u32, file_index ) __field( u64, pad1 ) __field( u64, addr3 ) __field( int, error ) __string( op_str, io_uring_get_opcode(sqe->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; __entry->flags = sqe->flags; __entry->ioprio = sqe->ioprio; __entry->off = sqe->off; __entry->addr = sqe->addr; __entry->len = sqe->len; __entry->op_flags = sqe->poll32_events; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; __entry->pad1 = sqe->__pad2[0]; __entry->addr3 = sqe->addr3; __entry->error = error; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, " "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " "error=%d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, __entry->buf_index, __entry->personality, __entry->file_index, (unsigned long long) __entry->pad1, (unsigned long long) __entry->addr3, __entry->error) ); /* * io_uring_cqe_overflow - a CQE overflowed * * @ctx: pointer to a ring context structure * @user_data: user data associated with the request * @res: CQE result * @cflags: CQE flags * @ocqe: pointer to the overflow cqe (if available) * */ TRACE_EVENT(io_uring_cqe_overflow, TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, void *ocqe), TP_ARGS(ctx, user_data, res, cflags, ocqe), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned long long, user_data ) __field( s32, res ) __field( u32, cflags ) __field( void *, ocqe ) ), TP_fast_assign( __entry->ctx = ctx; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; __entry->ocqe = ocqe; ), TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) ); /* * io_uring_task_work_run - ran task work * * @tctx: pointer to a io_uring_task * @count: how many functions it ran * */ TRACE_EVENT(io_uring_task_work_run, TP_PROTO(void *tctx, unsigned int count), TP_ARGS(tctx, count), TP_STRUCT__entry ( __field( void *, tctx ) __field( unsigned int, count ) ), TP_fast_assign( __entry->tctx = tctx; __entry->count = count; ), TP_printk("tctx %p, count %u", __entry->tctx, __entry->count) ); TRACE_EVENT(io_uring_short_write, TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), TP_ARGS(ctx, fpos, wanted, got), TP_STRUCT__entry( __field(void *, ctx) __field(u64, fpos) __field(u64, wanted) __field(u64, got) ), TP_fast_assign( __entry->ctx = ctx; __entry->fpos = fpos; __entry->wanted = wanted; __entry->got = got; ), TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", __entry->ctx, __entry->fpos, __entry->wanted, __entry->got) ); /* * io_uring_local_work_run - ran ring local task work * * @tctx: pointer to a io_uring_ctx * @count: how many functions it ran * @loops: how many loops it ran * */ TRACE_EVENT(io_uring_local_work_run, TP_PROTO(void *ctx, int count, unsigned int loops), TP_ARGS(ctx, count, loops), TP_STRUCT__entry ( __field(void *, ctx ) __field(int, count ) __field(unsigned int, loops ) ), TP_fast_assign( __entry->ctx = ctx; __entry->count = count; __entry->loops = loops; ), TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops) ); #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
734 742 741 736 734 736 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 // SPDX-License-Identifier: GPL-2.0-only // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner #include <linux/atomic.h> #include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/init.h> #include <linux/irqflags.h> #include <linux/kthread.h> #include <linux/minmax.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include "internal.h" #include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which does not depend * on the legacy style console_lock mechanism. * * The state of the console is maintained in the "nbcon_state" atomic * variable. * * The console is locked when: * * - The 'prio' field contains the priority of the context that owns the * console. Only higher priority contexts are allowed to take over the * lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked. * * - The 'cpu' field denotes on which CPU the console is locked. It is used * to prevent busy waiting on the same CPU. Also it informs the lock owner * that it has lost the lock in a more complex scenario when the lock was * taken over by a higher priority context, released, and taken on another * CPU with the same priority as the interrupted owner. * * The acquire mechanism uses a few more fields: * * - The 'req_prio' field is used by the handover approach to make the * current owner aware that there is a context with a higher priority * waiting for the friendly handover. * * - The 'unsafe' field allows to take over the console in a safe way in the * middle of emitting a message. The field is set only when accessing some * shared resources or when the console device is manipulated. It can be * cleared, for example, after emitting one character when the console * device is in a consistent state. * * - The 'unsafe_takeover' field is set when a hostile takeover took the * console in an unsafe state. The console will stay in the unsafe state * until re-initialized. * * The acquire mechanism uses three approaches: * * 1) Direct acquire when the console is not owned or is owned by a lower * priority context and is in a safe state. * * 2) Friendly handover mechanism uses a request/grant handshake. It is used * when the current owner has lower priority and the console is in an * unsafe state. * * The requesting context: * * a) Sets its priority into the 'req_prio' field. * * b) Waits (with a timeout) for the owning context to unlock the * console. * * c) Takes the lock and clears the 'req_prio' field. * * The owning context: * * a) Observes the 'req_prio' field set on exit from the unsafe * console state. * * b) Gives up console ownership by clearing the 'prio' field. * * 3) Unsafe hostile takeover allows to take over the lock even when the * console is an unsafe state. It is used only in panic() by the final * attempt to flush consoles in a try and hope mode. * * Note that separate record buffers are used in panic(). As a result, * the messages can be read and formatted without any risk even after * using the hostile takeover in unsafe state. * * The release function simply clears the 'prio' field. * * All operations on @console::nbcon_state are atomic cmpxchg based to * handle concurrency. * * The acquire/release functions implement only minimal policies: * * - Preference for higher priority contexts. * - Protection of the panic CPU. * * All other policy decisions must be made at the call sites: * * - What is marked as an unsafe section. * - Whether to spin-wait if there is already an owner and the console is * in an unsafe state. * - Whether to attempt an unsafe hostile takeover. * * The design allows to implement the well known: * * acquire() * output_one_printk_record() * release() * * The output of one printk record might be interrupted with a higher priority * context. The new owner is supposed to reprint the entire interrupted record * from scratch. */ /** * nbcon_state_set - Helper function to set the console state * @con: Console to update * @new: The new state to write * * Only to be used when the console is not yet or no longer visible in the * system. Otherwise use nbcon_state_try_cmpxchg(). */ static inline void nbcon_state_set(struct console *con, struct nbcon_state *new) { atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom); } /** * nbcon_state_read - Helper function to read the console state * @con: Console to read * @state: The state to store the result */ static inline void nbcon_state_read(struct console *con, struct nbcon_state *state) { state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state)); } /** * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state * @con: Console to update * @cur: Old/expected state * @new: New state * * Return: True on success. False on fail and @cur is updated. */ static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur, struct nbcon_state *new) { return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom); } /** * nbcon_seq_read - Read the current console sequence * @con: Console to read the sequence of * * Return: Sequence number of the next record to print on @con. */ u64 nbcon_seq_read(struct console *con) { unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq)); return __ulseq_to_u64seq(prb, nbcon_seq); } /** * nbcon_seq_force - Force console sequence to a specific value * @con: Console to work on * @seq: Sequence number value to set * * Only to be used during init (before registration) or in extreme situations * (such as panic with CONSOLE_REPLAY_ALL). */ void nbcon_seq_force(struct console *con, u64 seq) { /* * If the specified record no longer exists, the oldest available record * is chosen. This is especially important on 32bit systems because only * the lower 32 bits of the sequence number are stored. The upper 32 bits * are derived from the sequence numbers available in the ringbuffer. */ u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); } /** * nbcon_seq_try_update - Try to update the console sequence number * @ctxt: Pointer to an acquire context that contains * all information about the acquire mode * @new_seq: The new sequence number to set * * @ctxt->seq is updated to the new value of @con::nbcon_seq (expanded to * the 64bit value). This could be a different value than @new_seq if * nbcon_seq_force() was used or the current context no longer owns the * console. In the later case, it will stop printing anyway. */ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq) { unsigned long nbcon_seq = __u64seq_to_ulseq(ctxt->seq); struct console *con = ctxt->console; if (atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &nbcon_seq, __u64seq_to_ulseq(new_seq))) { ctxt->seq = new_seq; } else { ctxt->seq = nbcon_seq_read(con); } } /** * nbcon_context_try_acquire_direct - Try to acquire directly * @ctxt: The context of the caller * @cur: The current console state * * Acquire the console when it is released. Also acquire the console when * the current owner has a lower priority and the console is in a safe state. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU. * Or the current owner or waiter has the same or higher * priority. No acquire method can be successful in * this case. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; do { /* * Panic does not imply that the console is owned. However, it * is critical that non-panic CPUs during panic are unable to * acquire ownership in order to satisfy the assumptions of * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ if (other_cpu_in_panic()) return -EPERM; if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; if (cur->unsafe) return -EBUSY; /* * The console should never be safe for a direct acquire * if an unsafe hostile takeover has ever happened. */ WARN_ON_ONCE(cur->unsafe_takeover); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) { /* * The request context is well defined by the @req_prio because: * * - Only a context with a priority higher than the owner can become * a waiter. * - Only a context with a priority higher than the waiter can * directly take over the request. * - There are only three priorities. * - Only one CPU is allowed to request PANIC priority. * - Lower priorities are ignored during panic() until reboot. * * As a result, the following scenario is *not* possible: * * 1. This context is currently a waiter. * 2. Another context with a higher priority than this context * directly takes ownership. * 3. The higher priority context releases the ownership. * 4. Another lower priority context takes the ownership. * 5. Another context with the same priority as this context * creates a request and starts waiting. * * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Events #4 and #5 are not possible due to the other_cpu_in_panic() * check in nbcon_context_try_acquire_direct(). */ return (cur->req_prio == expected_prio); } /** * nbcon_context_try_acquire_requested - Try to acquire after having * requested a handover * @ctxt: The context of the caller * @cur: The current console state * * This is a helper function for nbcon_context_try_acquire_handover(). * It is called when the console is in an unsafe state. The current * owner will release the console on exit from the unsafe region. * * Return: 0 on success and @cur is updated to the new console state. * Otherwise an error code on failure. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU * or this context is no longer the waiter. * * -EBUSY: The console is still locked. The caller should * continue waiting. * * Note: The caller must still remove the request when an error has occurred * except when this context is no longer the waiter. */ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; /* Note that the caller must still remove the request! */ if (other_cpu_in_panic()) return -EPERM; /* * Note that the waiter will also change if there was an unsafe * hostile takeover. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* If still locked, caller should continue waiting. */ if (cur->prio != NBCON_PRIO_NONE) return -EBUSY; /* * The previous owner should have never released ownership * in an unsafe region. */ WARN_ON_ONCE(cur->unsafe); new.atom = cur->atom; new.prio = ctxt->prio; new.req_prio = NBCON_PRIO_NONE; new.unsafe = cur->unsafe_takeover; new.cpu = cpu; if (!nbcon_state_try_cmpxchg(con, cur, &new)) { /* * The acquire could fail only when it has been taken * over by a higher priority context. */ WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio)); return -EPERM; } /* Handover success. This context now owns the console. */ return 0; } /** * nbcon_context_try_acquire_handover - Try to acquire via handover * @ctxt: The context of the caller * @cur: The current console state * * The function must be called only when the context has higher priority * than the current owner and the console is in an unsafe state. * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY. * * The function sets "req_prio" field to make the current owner aware of * the request. Then it waits until the current owner releases the console, * or an even higher context takes over the request, or timeout expires. * * The current owner checks the "req_prio" field on exit from the unsafe * region and releases the console. It does not touch the "req_prio" field * so that the console stays reserved for the waiter. * * Return: 0 on success. Otherwise, an error code on failure. Also @cur * is updated to the latest state when failed to modify it. * * Errors: * * -EPERM: A panic is in progress and this is not the panic CPU. * Or a higher priority context has taken over the * console or the handover request. * * -EBUSY: The current owner is on the same CPU so that the hand * shake could not work. Or the current owner is not * willing to wait (zero timeout). Or the console does * not enter the safe state before timeout passed. The * caller might still use the unsafe hostile takeover * when allowed. * * -EAGAIN: @cur has changed when creating the handover request. * The caller should retry with direct acquire. */ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; int timeout; int request_err = -EBUSY; /* * Check that the handover is called when the direct acquire failed * with -EBUSY. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; /* * Console stays unsafe after an unsafe takeover until re-initialized. * Waiting is not going to help in this case. */ if (cur->unsafe_takeover) return -EBUSY; /* Is the caller willing to wait? */ if (ctxt->spinwait_max_us == 0) return -EBUSY; /* * Setup a request for the handover. The caller should try to acquire * the console directly when the current state has been modified. */ new.atom = cur->atom; new.req_prio = ctxt->prio; if (!nbcon_state_try_cmpxchg(con, cur, &new)) return -EAGAIN; cur->atom = new.atom; /* Wait until there is no owner and then acquire the console. */ for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) { /* On successful acquire, this request is cleared. */ request_err = nbcon_context_try_acquire_requested(ctxt, cur); if (!request_err) return 0; /* * If the acquire should be aborted, it must be ensured * that the request is removed before returning to caller. */ if (request_err == -EPERM) break; udelay(1); /* Re-read the state because some time has passed. */ nbcon_state_read(con, cur); } /* Timed out or aborted. Carefully remove handover request. */ do { /* * No need to remove request if there is a new waiter. This * can only happen if a higher priority context has taken over * the console or the handover request. */ if (!nbcon_waiter_matches(cur, ctxt->prio)) return -EPERM; /* Unset request for handover. */ new.atom = cur->atom; new.req_prio = NBCON_PRIO_NONE; if (nbcon_state_try_cmpxchg(con, cur, &new)) { /* * Request successfully unset. Report failure of * acquiring via handover. */ cur->atom = new.atom; return request_err; } /* * Unable to remove request. Try to acquire in case * the owner has released the lock. */ } while (nbcon_context_try_acquire_requested(ctxt, cur)); /* Lucky timing. The acquire succeeded while removing the request. */ return 0; } /** * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover * @ctxt: The context of the caller * @cur: The current console state * * Acquire the console even in the unsafe state. * * It can be permitted by setting the 'allow_unsafe_takeover' field only * by the final attempt to flush messages in panic(). * * Return: 0 on success. -EPERM when not allowed by the context. */ static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state new; if (!ctxt->allow_unsafe_takeover) return -EPERM; /* Ensure caller is allowed to perform unsafe hostile takeovers. */ if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC)) return -EPERM; /* * Check that try_acquire_direct() and try_acquire_handover() returned * -EBUSY in the right situation. */ WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(cur->unsafe != true); do { new.atom = cur->atom; new.cpu = cpu; new.prio = ctxt->prio; new.unsafe |= cur->unsafe_takeover; new.unsafe_takeover |= cur->unsafe; } while (!nbcon_state_try_cmpxchg(con, cur, &new)); return 0; } static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the * caller should check the current console state to see if it is * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; nbcon_state_read(con, &cur); try_again: err = nbcon_context_try_acquire_direct(ctxt, &cur); if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_handover(ctxt, &cur); if (err == -EAGAIN) goto try_again; if (err != -EBUSY) goto out; err = nbcon_context_try_acquire_hostile(ctxt, &cur); out: if (err) return false; /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. */ if (atomic_read(&panic_cpu) == cpu) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; /* Set the record sequence for this context to print. */ ctxt->seq = nbcon_seq_read(ctxt->console); return true; } static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, int expected_prio) { /* * A similar function, nbcon_waiter_matches(), only deals with * EMERGENCY and PANIC priorities. However, this function must also * deal with the NORMAL priority, which requires additional checks * and constraints. * * For the case where preemption and interrupts are disabled, it is * enough to also verify that the owning CPU has not changed. * * For the case where preemption or interrupts are enabled, an * external synchronization method *must* be used. In particular, * the driver-specific locking mechanism used in device_lock() * (including disabling migration) should be used. It prevents * scenarios such as: * * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and * is scheduled out. * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY * and releases it. * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X] * and is scheduled out. * 4. [Task A] gets running on [CPU X] and sees that the console is * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus * [Task A] thinks it is the owner when it is not. */ if (cur->prio != expected_prio) return false; if (cur->cpu != expected_cpu) return false; return true; } /** * nbcon_context_release - Release the console * @ctxt: The nbcon context from nbcon_context_try_acquire() */ static void nbcon_context_release(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { if (!nbcon_owner_matches(&cur, cpu, ctxt->prio)) break; new.atom = cur.atom; new.prio = NBCON_PRIO_NONE; /* * If @unsafe_takeover is set, it is kept set so that * the state remains permanently unsafe. */ new.unsafe |= cur.unsafe_takeover; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); ctxt->pbufs = NULL; } /** * nbcon_context_can_proceed - Check whether ownership can proceed * @ctxt: The nbcon context from nbcon_context_try_acquire() * @cur: The current console state * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * Must be invoked when entering the unsafe state to make sure that it still * owns the lock. Also must be invoked when exiting the unsafe context * to eventually free the lock for a higher priority context which asked * for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe * state. * * Also it can be called in the safe context before doing an expensive * safe operation. It does not make sense to do the operation when * a higher priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur) { unsigned int cpu = smp_processor_id(); /* Make sure this context still owns the console. */ if (!nbcon_owner_matches(cur, cpu, ctxt->prio)) return false; /* The console owner can proceed if there is no waiter. */ if (cur->req_prio == NBCON_PRIO_NONE) return true; /* * A console owner within an unsafe region is always allowed to * proceed, even if there are waiters. It can perform a handover * when exiting the unsafe region. Otherwise the waiter will * need to perform an unsafe hostile takeover. */ if (cur->unsafe) return true; /* Waiters always have higher priorities than owners. */ WARN_ON_ONCE(cur->req_prio <= cur->prio); /* * Having a safe point for take over and eventually a few * duplicated characters or a full line is way better than a * hostile takeover. Post processing can take care of the garbage. * Release and hand over. */ nbcon_context_release(ctxt); /* * It is not clear whether the waiter really took over ownership. The * outermost callsite must make the final decision whether console * ownership is needed for it to proceed. If yes, it must reacquire * ownership (possibly hostile) before carefully proceeding. * * The calling context no longer owns the console so go back all the * way instead of trying to implement reacquire heuristics in tons of * places. */ return false; } /** * nbcon_can_proceed - Check whether ownership can proceed * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * It is used in nbcon_enter_unsafe() to make sure that it still owns the * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock * for a higher priority context which asked for the friendly handover. * * It can be called inside an unsafe section when the console is just * temporary in safe state instead of exiting and entering the unsafe state. * * Also it can be called in the safe context before doing an expensive safe * operation. It does not make sense to do the operation when a higher * priority context took the lock. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; nbcon_state_read(con, &cur); return nbcon_context_can_proceed(ctxt, &cur); } EXPORT_SYMBOL_GPL(nbcon_can_proceed); #define nbcon_context_enter_unsafe(c) __nbcon_context_update_unsafe(c, true) #define nbcon_context_exit_unsafe(c) __nbcon_context_update_unsafe(c, false) /** * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state * @ctxt: The nbcon context from nbcon_context_try_acquire() * @unsafe: The new value for the unsafe bit * * Return: True if the unsafe state was updated and this context still * owns the console. Otherwise false if ownership was handed * over or taken. * * This function allows console owners to modify the unsafe status of the * console. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. * * Internal helper to avoid duplicated code. */ static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe) { struct console *con = ctxt->console; struct nbcon_state cur; struct nbcon_state new; nbcon_state_read(con, &cur); do { /* * The unsafe bit must not be cleared if an * unsafe hostile takeover has occurred. */ if (!unsafe && cur.unsafe_takeover) goto out; if (!nbcon_context_can_proceed(ctxt, &cur)) return false; new.atom = cur.atom; new.unsafe = unsafe; } while (!nbcon_state_try_cmpxchg(con, &cur, &new)); cur.atom = new.atom; out: return nbcon_context_can_proceed(ctxt, &cur); } static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, char *buf, unsigned int len) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; struct nbcon_state cur; wctxt->outbuf = buf; wctxt->len = len; nbcon_state_read(con, &cur); wctxt->unsafe_takeover = cur.unsafe_takeover; } /** * nbcon_enter_unsafe - Enter an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool is_owner; is_owner = nbcon_context_enter_unsafe(ctxt); if (!is_owner) nbcon_write_context_set_buf(wctxt, NULL, 0); return is_owner; } EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); /** * nbcon_exit_unsafe - Exit an unsafe region in the driver * @wctxt: The write context that was handed to the write function * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. */ bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); bool ret; ret = nbcon_context_exit_unsafe(ctxt); if (!ret) nbcon_write_context_set_buf(wctxt, NULL, 0); return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** * nbcon_reacquire_nobuf - Reacquire a console after losing ownership * while printing * @wctxt: The write context that was handed to the write callback * * Since ownership can be lost at any time due to handover or takeover, a * printing context _must_ be prepared to back out immediately and * carefully. However, there are scenarios where the printing context must * reacquire ownership in order to finalize or revert hardware changes. * * This function allows a printing context to reacquire ownership using the * same priority as its previous ownership. * * Note that after a successful reacquire the printing context will have no * output buffer because that has been lost. This function cannot be used to * resume printing. */ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); while (!nbcon_context_try_acquire(ctxt)) cpu_relax(); nbcon_write_context_set_buf(wctxt, NULL, 0); } EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); /** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. * * When this function returns false then the calling context no longer owns * the console and is no longer allowed to go forward. In this case it must * back out immediately and carefully. The buffer content is also no longer * trusted since it no longer belongs to the calling context. If the caller * wants to do more it must reacquire the console first. * * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer, */ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; struct printk_message pmsg = { .pbufs = ctxt->pbufs, }; unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; unsigned long ulseq; /* * This function should never be called for consoles that have not * implemented the necessary callback for writing: i.e. legacy * consoles and, when atomic, nbcon consoles with no write_atomic(). * Handle it as if ownership was lost and try to continue. * * Note that for nbcon consoles the write_thread() callback is * mandatory and was already checked in nbcon_alloc(). */ if (WARN_ON_ONCE((use_atomic && !con->write_atomic) || !(console_srcu_read_flags(con) & CON_NBCON))) { nbcon_context_release(ctxt); return false; } /* * The printk buffers are filled within an unsafe section. This * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from * clobbering each other. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true); if (!ctxt->backlog) return nbcon_context_exit_unsafe(ctxt); /* * @con->dropped is not protected in case of an unsafe hostile * takeover. In that situation the update can be racy so * annotate it accordingly. */ con_dropped = data_race(READ_ONCE(con->dropped)); dropped = con_dropped + pmsg.dropped; if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); /* * If the previous owner was assigned the same record, this context * has taken over ownership and is replaying the record. Prepend a * message to let the user know the record is replayed. */ ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { console_prepend_replay(&pmsg); } else { /* * Ensure this context is still the owner before trying to * update @nbcon_prev_seq. Otherwise the value in @ulseq may * not be from the previous owner and instead be some later * value from the context that took over ownership. */ nbcon_state_read(con, &cur); if (!nbcon_context_can_proceed(ctxt, &cur)) return false; atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, __u64seq_to_ulseq(pmsg.seq)); } if (!nbcon_context_exit_unsafe(ctxt)) return false; /* For skipped records just update seq/dropped in @con. */ if (pmsg.outbuf_len == 0) goto update_con; /* Initialize the write context for driver callbacks. */ nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); if (use_atomic) con->write_atomic(con, wctxt); else con->write_thread(con, wctxt); if (!wctxt->outbuf) { /* * Ownership was lost and reacquired by the driver. Handle it * as if ownership was lost. */ nbcon_context_release(ctxt); return false; } /* * Ownership may have been lost but _not_ reacquired by the driver. * This case is detected and handled when entering unsafe to update * dropped/seq values. */ /* * Since any dropped message was successfully output, reset the * dropped count for the console. */ dropped = 0; update_con: /* * The dropped count and the sequence number are updated within an * unsafe section. This limits update races to the panic context and * allows the panic context to win. */ if (!nbcon_context_enter_unsafe(ctxt)) return false; if (dropped != con_dropped) { /* Counterpart to the READ_ONCE() above. */ WRITE_ONCE(con->dropped, dropped); } nbcon_seq_try_update(ctxt, pmsg.seq + 1); return nbcon_context_exit_unsafe(ctxt); } /* * nbcon_emit_one - Print one record for an nbcon console using the * specified callback * @wctxt: An initialized write context struct to use for this context * @use_atomic: True if the write_atomic() callback is to be used * * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This is an internal helper to handle the locking of the console before * calling nbcon_emit_next_record(). */ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; unsigned long flags; bool ret = false; if (!use_atomic) { con->device_lock(con, &flags); /* * Ensure this stays on the CPU to make handover and * takeover possible. */ cant_migrate(); } if (!nbcon_context_try_acquire(ctxt)) goto out; /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. * * The higher priority printing context takes over responsibility * to print the pending records. */ if (!nbcon_emit_next_record(wctxt, use_atomic)) goto out; nbcon_context_release(ctxt); ret = ctxt->backlog; out: if (!use_atomic) con->device_unlock(con, flags); return ret; } /** * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup * @con: Console to operate on * @ctxt: The nbcon context from nbcon_context_try_acquire() * * Return: True if the thread should shutdown or if the console is * allowed to print and a record is available. False otherwise. * * After the thread wakes up, it must first check if it should shutdown before * attempting any printing. */ static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) { bool ret = false; short flags; int cookie; if (kthread_should_stop()) return true; cookie = console_srcu_read_lock(); flags = console_srcu_read_flags(con); if (console_is_usable(con, flags, false)) { /* Bring the sequence in @ctxt up to date */ ctxt->seq = nbcon_seq_read(con); ret = prb_read_valid(prb, ctxt->seq, NULL); } console_srcu_read_unlock(cookie); return ret; } /** * nbcon_kthread_func - The printer thread function * @__console: Console to operate on * * Return: 0 */ static int nbcon_kthread_func(void *__console) { struct console *con = __console; struct nbcon_write_context wctxt = { .ctxt.console = con, .ctxt.prio = NBCON_PRIO_NORMAL, }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); short con_flags; bool backlog; int cookie; wait_for_event: /* * Guarantee this task is visible on the rcuwait before * checking the wake condition. * * The full memory barrier within set_current_state() of * ___rcuwait_wait_event() pairs with the full memory * barrier within rcuwait_has_sleeper(). * * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. */ rcuwait_wait_event(&con->rcuwait, nbcon_kthread_should_wakeup(con, ctxt), TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ do { if (kthread_should_stop()) return 0; backlog = false; /* * Keep the srcu read lock around the entire operation so that * synchronize_srcu() can guarantee that the kthread stopped * or suspended printing. */ cookie = console_srcu_read_lock(); con_flags = console_srcu_read_flags(con); if (console_is_usable(con, con_flags, false)) backlog = nbcon_emit_one(&wctxt, false); console_srcu_read_unlock(cookie); cond_resched(); } while (backlog); goto wait_for_event; } /** * nbcon_irq_work - irq work to wake console printer thread * @irq_work: The irq work to operate on */ static void nbcon_irq_work(struct irq_work *irq_work) { struct console *con = container_of(irq_work, struct console, irq_work); nbcon_kthread_wake(con); } static inline bool rcuwait_has_sleeper(struct rcuwait *w) { /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the rcuwait is empty. * * This full memory barrier pairs with the full memory barrier within * set_current_state() of ___rcuwait_wait_event(), which is called * after prepare_to_rcuwait() adds the waiter but before it has * checked the wait condition. * * This pairs with nbcon_kthread_func:A. */ smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ return rcuwait_active(w); } /** * nbcon_kthreads_wake - Wake up printing threads using irq_work */ void nbcon_kthreads_wake(void) { struct console *con; int cookie; if (!printk_kthreads_running) return; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { if (!(console_srcu_read_flags(con) & CON_NBCON)) continue; /* * Only schedule irq_work if the printing thread is * actively waiting. If not waiting, the thread will * notice by itself that it has work to do. */ if (rcuwait_has_sleeper(&con->rcuwait)) irq_work_queue(&con->irq_work); } console_srcu_read_unlock(cookie); } /* * nbcon_kthread_stop - Stop a console printer thread * @con: Console to operate on */ void nbcon_kthread_stop(struct console *con) { lockdep_assert_console_list_lock_held(); if (!con->kthread) return; kthread_stop(con->kthread); con->kthread = NULL; } /** * nbcon_kthread_create - Create a console printer thread * @con: Console to operate on * * Return: True if the kthread was started or already exists. * Otherwise false and @con must not be registered. * * This function is called when it will be expected that nbcon consoles are * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL * will be no longer flushed by the legacy loop. This is why failure must * be fatal for console registration. * * If @con was already registered and this function fails, @con must be * unregistered before the global state variable @printk_kthreads_running * can be set. */ bool nbcon_kthread_create(struct console *con) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); if (con->kthread) return true; kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); if (WARN_ON(IS_ERR(kt))) { con_printk(KERN_ERR, con, "failed to start printing thread\n"); return false; } con->kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(con->kthread, -20); return true; } /* Track the nbcon emergency nesting per CPU. */ static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; /** * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer * * Context: For reading, any context. For writing, any context which could * not be migrated to another CPU. * Return: Either a pointer to the per CPU emergency nesting counter of * the current CPU or to the init data during early boot. * * The function is safe for reading per-CPU variables in any context because * preemption is disabled if the current CPU is in the emergency state. See * also nbcon_cpu_emergency_enter(). */ static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) { /* * The value of __printk_percpu_data_ready gets set in normal * context and before SMP initialization. As a result it could * never change while inside an nbcon emergency section. */ if (!printk_percpu_data_ready()) return &early_nbcon_pcpu_emergency_nesting; return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting); } /** * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon * printing on the current CPU * * Context: Any context. * Return: The nbcon_prio to use for acquiring an nbcon console in this * context for printing. * * The function is safe for reading per-CPU data in any context because * preemption is disabled if the current CPU is in the emergency or panic * state. */ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; if (this_cpu_in_panic()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (*cpu_emergency_nesting) return NBCON_PRIO_EMERGENCY; return NBCON_PRIO_NORMAL; } /** * nbcon_legacy_emit_next_record - Print one record for an nbcon console * in legacy contexts * @con: The console to print on * @handover: Will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding * both the console_lock and the SRCU read lock. Otherwise it * is set to false. * @cookie: The cookie from the SRCU read lock. * @use_atomic: Set true when called in an atomic or unknown context. * It affects which nbcon callback will be used: write_atomic() * or write_thread(). * * When false, the write_thread() callback is used and would be * called in a preemtible context unless disabled by the * device_lock. The legacy handover is not allowed in this mode. * * Context: Any context except NMI. * Return: True, when a record has been printed and there are still * pending records. The caller might want to continue flushing. * * False, when there is no pending record, or when the console * context cannot be acquired, or the ownership has been lost. * The caller should give up. Either the job is done, cannot be * done, or will be handled by the owning context. * * This function is meant to be called by console_flush_all() to print records * on nbcon consoles from legacy context (printing via console unlocking). * Essentially it is the nbcon version of console_emit_next_record(). */ bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, int cookie, bool use_atomic) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); unsigned long flags; bool progress; ctxt->console = con; ctxt->prio = nbcon_get_default_prio(); if (use_atomic) { /* * In an atomic or unknown context, use the same procedure as * in console_emit_next_record(). It allows to handover. */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); stop_critical_timings(); } progress = nbcon_emit_one(&wctxt, use_atomic); if (use_atomic) { start_critical_timings(); *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } else { /* Non-atomic does not perform legacy spinning handovers. */ *handover = false; } return progress; } /** * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on * failure. * * Errors: * * -EPERM: Unable to acquire console ownership. * * -EAGAIN: Another context took over ownership while printing. * * -ENOENT: A record before @stop_seq is not available. * * If flushing up to @stop_seq was not successful, it only makes sense for the * caller to try again when -EAGAIN was returned. When -EPERM is returned, * this context is not allowed to acquire the console. When -ENOENT is * returned, it cannot be expected that the unfinalized record will become * available. */ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, bool allow_unsafe_takeover) { struct nbcon_write_context wctxt = { }; struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); int err = 0; ctxt->console = con; ctxt->spinwait_max_us = 2000; ctxt->prio = nbcon_get_default_prio(); ctxt->allow_unsafe_takeover = allow_unsafe_takeover; if (!nbcon_context_try_acquire(ctxt)) return -EPERM; while (nbcon_seq_read(con) < stop_seq) { /* * nbcon_emit_next_record() returns false when the console was * handed over or taken over. In both cases the context is no * longer valid. */ if (!nbcon_emit_next_record(&wctxt, true)) return -EAGAIN; if (!ctxt->backlog) { /* Are there reserved but not yet finalized records? */ if (nbcon_seq_read(con) < stop_seq) err = -ENOENT; break; } } nbcon_context_release(ctxt); return err; } /** * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its * write_atomic() callback * @con: The nbcon console to flush * @stop_seq: Flush up until this record * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers * * This will stop flushing before @stop_seq if another context has ownership. * That context is then responsible for the flushing. Likewise, if new records * are added while this context was flushing and there is no other context * to handle the printing, this context must also flush those records. */ static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, bool allow_unsafe_takeover) { struct console_flush_type ft; unsigned long flags; int err; again: /* * Atomic flushing does not use console driver synchronization (i.e. * it does not hold the port lock for uart consoles). Therefore IRQs * must be disabled to avoid being interrupted and then calling into * a driver that will deadlock trying to acquire console ownership. */ local_irq_save(flags); err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); local_irq_restore(flags); /* * If there was a new owner (-EPERM, -EAGAIN), that context is * responsible for completing. * * Do not wait for records not yet finalized (-ENOENT) to avoid a * possible deadlock. They will either get flushed by the writer or * eventually skipped on panic CPU. */ if (err) return; /* * If flushing was successful but more records are available, this * context must flush those remaining records if the printer thread * is not available do it. */ printk_get_console_flush_type(&ft); if (!ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { stop_seq = prb_next_reserve_seq(prb); goto again; } } /** * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * @stop_seq: Flush up until this record * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers */ static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) { struct console *con; int cookie; cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); if (!(flags & CON_NBCON)) continue; if (!console_is_usable(con, flags, true)) continue; if (nbcon_seq_read(con) >= stop_seq) continue; nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); } console_srcu_read_unlock(cookie); } /** * nbcon_atomic_flush_pending - Flush all nbcon consoles using their * write_atomic() callback * * Flush the backlog up through the currently newest record. Any new * records added while flushing will not be flushed if there is another * context available to handle the flushing. This is to avoid one CPU * printing unbounded because other CPUs continue to add records. */ void nbcon_atomic_flush_pending(void) { __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); } /** * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their * write_atomic() callback and allowing unsafe hostile takeovers * * Flush the backlog up through the currently newest record. Unsafe hostile * takeovers will be performed, if necessary. */ void nbcon_atomic_flush_unsafe(void) { __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); } /** * nbcon_cpu_emergency_enter - Enter an emergency section where printk() * messages for that CPU are flushed directly * * Context: Any context. Disables preemption. * * When within an emergency section, printk() calls will attempt to flush any * pending messages in the ringbuffer. */ void nbcon_cpu_emergency_enter(void) { unsigned int *cpu_emergency_nesting; preempt_disable(); cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; } /** * nbcon_cpu_emergency_exit - Exit an emergency section * * Context: Within an emergency section. Enables preemption. */ void nbcon_cpu_emergency_exit(void) { unsigned int *cpu_emergency_nesting; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; preempt_enable(); } /** * nbcon_alloc - Allocate and init the nbcon console specific data * @con: Console to initialize * * Return: True if the console was fully allocated and initialized. * Otherwise @con must not be registered. * * When allocation and init was successful, the console must be properly * freed using nbcon_free() once it is no longer needed. */ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; rcuwait_init(&con->rcuwait); init_irq_work(&con->irq_work, nbcon_irq_work); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); /* * Initialize @nbcon_seq to the highest possible sequence number so * that practically speaking it will have nothing to print until a * desired initial sequence number has been set via nbcon_seq_force(). */ atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb)); if (con->flags & CON_BOOT) { /* * Boot console printing is synchronized with legacy console * printing, so boot consoles can share the same global printk * buffers. */ con->pbufs = &printk_shared_pbufs; } else { con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); if (!con->pbufs) { con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); return false; } if (printk_kthreads_running) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } } } return true; } /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; if (printk_kthreads_running) nbcon_kthread_stop(con); nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ if (!(con->flags & CON_BOOT)) kfree(con->pbufs); con->pbufs = NULL; } /** * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe * section * @con: The nbcon console to acquire * * Context: Under the locking mechanism implemented in * @con->device_lock() including disabling migration. * Return: True if the console was acquired. False otherwise. * * Console drivers will usually use their own internal synchronization * mechasism to synchronize between console printing and non-printing * activities (such as setting baud rates). However, nbcon console drivers * supporting atomic consoles may also want to mark unsafe sections when * performing non-printing activities in order to synchronize against their * atomic_write() callback. * * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL * and marks it unsafe for handover/takeover. */ bool nbcon_device_try_acquire(struct console *con) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); cant_migrate(); memset(ctxt, 0, sizeof(*ctxt)); ctxt->console = con; ctxt->prio = NBCON_PRIO_NORMAL; if (!nbcon_context_try_acquire(ctxt)) return false; if (!nbcon_context_enter_unsafe(ctxt)) return false; return true; } EXPORT_SYMBOL_GPL(nbcon_device_try_acquire); /** * nbcon_device_release - Exit unsafe section and release the nbcon console * @con: The nbcon console acquired in nbcon_device_try_acquire() */ void nbcon_device_release(struct console *con) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); struct console_flush_type ft; int cookie; if (!nbcon_context_exit_unsafe(ctxt)) return; nbcon_context_release(ctxt); /* * This context must flush any new records added while the console * was locked if the printer thread is not available to do it. The * console_srcu_read_lock must be taken to ensure the console is * usable throughout flushing. */ cookie = console_srcu_read_lock(); printk_get_console_flush_type(&ft); if (console_is_usable(con, console_srcu_read_flags(con), true) && !ft.nbcon_offload && prb_read_valid(prb, nbcon_seq_read(con), NULL)) { /* * If nbcon_atomic flushing is not available, fallback to * using the legacy loop. */ if (ft.nbcon_atomic) { __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); } else if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } else if (ft.legacy_offload) { printk_trigger_flush(); } } console_srcu_read_unlock(cookie); } EXPORT_SYMBOL_GPL(nbcon_device_release);
455 449 50 341 391 342 342 49 50 134 134 134 51 49 50 50 50 50 51 49 49 50 50 50 50 49 50 49 50 1 1 1 1 446 13 439 445 444 446 444 445 445 24 304 445 408 410 410 410 235 132 342 342 342 342 408 342 133 422 396 395 422 411 340 342 23 55 5 399 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 // SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/csd.h> #undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other * CPUs might arrive late, either due to hardware latencies or * because this CPU disabled interrupts (inside stop-machine) * before the IPIs were sent. So flush out any pending callbacks * explicitly (without waiting for the IPIs to arrive), to * ensure that the outgoing CPU doesn't go offline with work * still pending. */ __flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } static __always_inline void send_call_function_single_ipi(int cpu) { if (call_function_single_prep_ipi(cpu)) { trace_ipi_send_cpu(cpu, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_single_ipi(cpu); } } static __always_inline void send_call_function_ipi_mask(struct cpumask *mask) { trace_ipi_send_cpumask(mask, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_ipi_mask(mask); } static __always_inline void csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); trace_csd_function_exit(func, csd); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); /* * Parse the csdlock_debug= kernel boot parameter. * * If you need to restore the old "ext" value that once provided * additional debugging information, reapply the following commits: * * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") */ static int __init csdlock_debug(char *str) { int ret; unsigned int val = 0; ret = get_option(&str, &val); if (ret) { if (val) static_branch_enable(&csdlock_debug_enabled); else static_branch_disable(&csdlock_debug_enabled); } return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0444); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ module_param(panic_on_ipistall, int, 0444); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. */ } static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? * * Returns @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. */ bool csd_lock_is_stuck(void) { return !!atomic_read(&n_csd_lock_stuck); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); atomic_dec(&n_csd_lock_stuck); return true; } ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || csd_lock_timeout_ns == 0)) return false; if (ts0 > ts2) { /* Our own sched_clock went backward; don't blame another CPU. */ ts_delta = ts0 - ts2; pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); *ts1 = ts2; return false; } firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering * on underflows when the TSC is out of sync between sockets. */ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } if (firsttime) dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ static void __csd_lock_wait(call_single_data_t *csd) { unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else static void csd_lock_record(call_single_data_t *csd) { } static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { /* * We have to check the type of the CSD before queueing it, because * once queued it can have its flags cleared by * flush_smp_call_function_queue() * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; csd = container_of(node, call_single_data_t, node.llist); func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* * The list addition should be visible to the target CPU when it pops * the head of the list to pull the entry off it in the IPI handler * because of normal cache coherency rules implied by the underlying * llist ops. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { __flush_smp_call_function_queue(true); } /** * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; atomic_t *tbt; lockdep_assert_irqs_disabled(); /* Allow waiters to send backtrace NMI from here onwards */ tbt = this_cpu_ptr(&trigger_backtrace); atomic_set_release(tbt, 1); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) return; /* * Second; run all !SYNC callbacks. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. */ if (entry) { csd = llist_entry(entry, typeof(*csd), node.llist); csd_do_func(sched_ttwu_pending, entry, csd); } } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * from task context (idle, migration thread) * * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to * handle queued SMP function calls before scheduling. * * The migration thread has to ensure that an eventually pending wakeup has * been handled before it migrates a task. */ void flush_smp_call_function_queue(void) { unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * prevent preemption and reschedule on another processor, * as well as CPU removal */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. * * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). * * Selection preference: * 1) current cpu if in @mask * 2) any cpu of current node if in @mask * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (cpumask_test_cpu(cpu, mask)) goto call; /* Try for same node. */ nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) goto call; } /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ cpu = cpumask_any_and(mask, cpu_online_mask); call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; bool run_local = false; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask)) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); if (cpu < nr_cpu_ids) run_remote = true; if (run_remote) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); if (cond_func && !cond_func(cpu, info)) { __cpumask_clear_cpu(cpu, cfd->cpumask); continue; } csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } } /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) send_call_function_ipi_mask(cfd->cpumask_ipi); } if (run_local && (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); csd_do_func(func, info, NULL); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } } /** * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. */ void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) set_nr_cpu_ids(nr_cpus); return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); #if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer. It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. */ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * wake_up_all_idle_cpus try to break all cpus which is in idle state even * including idle polling cpus, for non-idle cpus, we will do nothing * for them. */ void wake_up_all_idle_cpus(void) { int cpu; for_each_possible_cpu(cpu) { preempt_disable(); if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); preempt_enable(); } } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu);
733 742 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _KERNEL_PRINTK_RINGBUFFER_H #define _KERNEL_PRINTK_RINGBUFFER_H #include <linux/atomic.h> #include <linux/bits.h> #include <linux/dev_printk.h> #include <linux/stddef.h> #include <linux/types.h> /* * Meta information about each stored message. * * All fields are set by the printk code except for @seq, which is * set by the ringbuffer code. */ struct printk_info { u64 seq; /* sequence number */ u64 ts_nsec; /* timestamp in nanoseconds */ u16 text_len; /* length of text message */ u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ u32 caller_id; /* thread id or processor id */ struct dev_printk_info dev_info; }; /* * A structure providing the buffers, used by writers and readers. * * Writers: * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to * buffers reserved for that writer. * * Readers: * Using prb_rec_init_rd(), a reader sets all fields before calling * prb_read_valid(). Note that the reader provides the @info and @text_buf, * buffers. On success, the struct pointed to by @info will be filled and * the char array pointed to by @text_buf will be filled with text data. */ struct printk_record { struct printk_info *info; char *text_buf; unsigned int text_buf_size; }; /* Specifies the logical position and span of a data block. */ struct prb_data_blk_lpos { unsigned long begin; unsigned long next; }; /* * A descriptor: the complete meta-data for a record. * * @state_var: A bitwise combination of descriptor ID and descriptor state. */ struct prb_desc { atomic_long_t state_var; struct prb_data_blk_lpos text_blk_lpos; }; /* A ringbuffer of "ID + data" elements. */ struct prb_data_ring { unsigned int size_bits; char *data; atomic_long_t head_lpos; atomic_long_t tail_lpos; }; /* A ringbuffer of "struct prb_desc" elements. */ struct prb_desc_ring { unsigned int count_bits; struct prb_desc *descs; struct printk_info *infos; atomic_long_t head_id; atomic_long_t tail_id; atomic_long_t last_finalized_seq; }; /* * The high level structure representing the printk ringbuffer. * * @fail: Count of failed prb_reserve() calls where not even a data-less * record was created. */ struct printk_ringbuffer { struct prb_desc_ring desc_ring; struct prb_data_ring text_data_ring; atomic_long_t fail; }; /* * Used by writers as a reserve/commit handle. * * @rb: Ringbuffer where the entry is reserved. * @irqflags: Saved irq flags to restore on entry commit. * @id: ID of the reserved descriptor. * @text_space: Total occupied buffer space in the text data ring, including * ID, alignment padding, and wrapping data blocks. * * This structure is an opaque handle for writers. Its contents are only * to be used by the ringbuffer implementation. */ struct prb_reserved_entry { struct printk_ringbuffer *rb; unsigned long irqflags; unsigned long id; unsigned int text_space; }; /* The possible responses of a descriptor state-query. */ enum desc_state { desc_miss = -1, /* ID mismatch (pseudo state) */ desc_reserved = 0x0, /* reserved, in use by writer */ desc_committed = 0x1, /* committed by writer, could get reopened */ desc_finalized = 0x2, /* committed, no further modification allowed */ desc_reusable = 0x3, /* free, not yet used by any writer */ }; #define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) #define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) #define DESC_SV_BITS BITS_PER_LONG #define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) #define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) #define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) #define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) #define DESC_ID_MASK (~DESC_FLAGS_MASK) #define DESC_ID(sv) ((sv) & DESC_ID_MASK) /* * Special data block logical position values (for fields of * @prb_desc.text_blk_lpos). * * - Bit0 is used to identify if the record has no data block. (Implemented in * the LPOS_DATALESS() macro.) * * - Bit1 specifies the reason for not having a data block. * * These special values could never be real lpos values because of the * meta data and alignment padding of data blocks. (See to_blk_size() for * details.) */ #define FAILED_LPOS 0x1 #define EMPTY_LINE_LPOS 0x3 #define FAILED_BLK_LPOS \ { \ .begin = FAILED_LPOS, \ .next = FAILED_LPOS, \ } /* * Descriptor Bootstrap * * The descriptor array is minimally initialized to allow immediate usage * by readers and writers. The requirements that the descriptor array * initialization must satisfy: * * Req1 * The tail must point to an existing (committed or reusable) descriptor. * This is required by the implementation of prb_first_seq(). * * Req2 * Readers must see that the ringbuffer is initially empty. * * Req3 * The first record reserved by a writer is assigned sequence number 0. * * To satisfy Req1, the tail initially points to a descriptor that is * minimally initialized (having no data block, i.e. data-less with the * data block's lpos @begin and @next values set to FAILED_LPOS). * * To satisfy Req2, the initial tail descriptor is initialized to the * reusable state. Readers recognize reusable descriptors as existing * records, but skip over them. * * To satisfy Req3, the last descriptor in the array is used as the initial * head (and tail) descriptor. This allows the first record reserved by a * writer (head + 1) to be the first descriptor in the array. (Only the first * descriptor in the array could have a valid sequence number of 0.) * * The first time a descriptor is reserved, it is assigned a sequence number * with the value of the array index. A "first time reserved" descriptor can * be recognized because it has a sequence number of 0 but does not have an * index of 0. (Only the first descriptor in the array could have a valid * sequence number of 0.) After the first reservation, all future reservations * (recycling) simply involve incrementing the sequence number by the array * count. * * Hack #1 * Only the first descriptor in the array is allowed to have the sequence * number 0. In this case it is not possible to recognize if it is being * reserved the first time (set to index value) or has been reserved * previously (increment by the array count). This is handled by _always_ * incrementing the sequence number by the array count when reserving the * first descriptor in the array. In order to satisfy Req3, the sequence * number of the first descriptor in the array is initialized to minus * the array count. Then, upon the first reservation, it is incremented * to 0, thus satisfying Req3. * * Hack #2 * prb_first_seq() can be called at any time by readers to retrieve the * sequence number of the tail descriptor. However, due to Req2 and Req3, * initially there are no records to report the sequence number of * (sequence numbers are u64 and there is nothing less than 0). To handle * this, the sequence number of the initial tail descriptor is initialized * to 0. Technically this is incorrect, because there is no record with * sequence number 0 (yet) and the tail descriptor is not the first * descriptor in the array. But it allows prb_read_valid() to correctly * report the existence of a record for _any_ given sequence number at all * times. Bootstrapping is complete when the tail is pushed the first * time, thus finally pointing to the first descriptor reserved by a * writer, which has the assigned sequence number 0. */ /* * Initiating Logical Value Overflows * * Both logical position (lpos) and ID values can be mapped to array indexes * but may experience overflows during the lifetime of the system. To ensure * that printk_ringbuffer can handle the overflows for these types, initial * values are chosen that map to the correct initial array indexes, but will * result in overflows soon. * * BLK0_LPOS * The initial @head_lpos and @tail_lpos for data rings. It is at index * 0 and the lpos value is such that it will overflow on the first wrap. * * DESC0_ID * The initial @head_id and @tail_id for the desc ring. It is at the last * index of the descriptor array (see Req3 above) and the ID value is such * that it will overflow on the second wrap. */ #define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) #define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) #define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) /* * Define a ringbuffer with an external text data buffer. The same as * DEFINE_PRINTKRB() but requires specifying an external buffer for the * text data. * * Note: The specified external buffer must be of the size: * 2 ^ (descbits + avgtextbits) */ #define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ /* reusable */ \ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ /* no associated data block */ \ .text_blk_lpos = FAILED_BLK_LPOS, \ }, \ }; \ static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ /* this will be the first record reserved by a writer */ \ [0] = { \ /* will be incremented to 0 on the first reservation */ \ .seq = -(u64)_DESCS_COUNT(descbits), \ }, \ /* the initial head and tail */ \ [_DESCS_COUNT(descbits) - 1] = { \ /* reports the first seq value during the bootstrap phase */ \ .seq = 0, \ }, \ }; \ static struct printk_ringbuffer name = { \ .desc_ring = { \ .count_bits = descbits, \ .descs = &_##name##_descs[0], \ .infos = &_##name##_infos[0], \ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ .last_finalized_seq = ATOMIC_INIT(0), \ }, \ .text_data_ring = { \ .size_bits = (avgtextbits) + (descbits), \ .data = text_buf, \ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ }, \ .fail = ATOMIC_LONG_INIT(0), \ } /** * DEFINE_PRINTKRB() - Define a ringbuffer. * * @name: The name of the ringbuffer variable. * @descbits: The number of descriptors as a power-of-2 value. * @avgtextbits: The average text data size per record as a power-of-2 value. * * This is a macro for defining a ringbuffer and all internal structures * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a * variant where the text data buffer can be specified externally. */ #define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ __aligned(__alignof__(unsigned long)); \ _DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) /* Writer Interface */ /** * prb_rec_init_wr() - Initialize a buffer for writing records. * * @r: The record to initialize. * @text_buf_size: The needed text buffer size. */ static inline void prb_rec_init_wr(struct printk_record *r, unsigned int text_buf_size) { r->info = NULL; r->text_buf = NULL; r->text_buf_size = text_buf_size; } bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r); bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size); void prb_commit(struct prb_reserved_entry *e); void prb_final_commit(struct prb_reserved_entry *e); void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int text_buf_size, struct prb_desc *descs, unsigned int descs_count_bits, struct printk_info *infos); unsigned int prb_record_text_space(struct prb_reserved_entry *e); /* Reader Interface */ /** * prb_rec_init_rd() - Initialize a buffer for reading records. * * @r: The record to initialize. * @info: A buffer to store record meta-data. * @text_buf: A buffer to store text data. * @text_buf_size: The size of @text_buf. * * Initialize all the fields that a reader is interested in. All arguments * (except @r) are optional. Only record data for arguments that are * non-NULL or non-zero will be read. */ static inline void prb_rec_init_rd(struct printk_record *r, struct printk_info *info, char *text_buf, unsigned int text_buf_size) { r->info = info; r->text_buf = text_buf; r->text_buf_size = text_buf_size; } /** * prb_for_each_record() - Iterate over the records of a ringbuffer. * * @from: The sequence number to begin with. * @rb: The ringbuffer to iterate over. * @s: A u64 to store the sequence number on each iteration. * @r: A printk_record to store the record on each iteration. * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each * iteration. For the sequence number, @r->info->seq should be checked. * * Context: Any context. */ #define prb_for_each_record(from, rb, s, r) \ for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) /** * prb_for_each_info() - Iterate over the meta data of a ringbuffer. * * @from: The sequence number to begin with. * @rb: The ringbuffer to iterate over. * @s: A u64 to store the sequence number on each iteration. * @i: A printk_info to store the record meta data on each iteration. * @lc: An unsigned int to store the text line count of each record. * * This is a macro for conveniently iterating over a ringbuffer. * Note that @s may not be the sequence number of the record on each * iteration. For the sequence number, @r->info->seq should be checked. * * Context: Any context. */ #define prb_for_each_info(from, rb, s, i, lc) \ for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r); bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count); u64 prb_first_seq(struct printk_ringbuffer *rb); u64 prb_first_valid_seq(struct printk_ringbuffer *rb); u64 prb_next_seq(struct printk_ringbuffer *rb); u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); #ifdef CONFIG_64BIT #define __u64seq_to_ulseq(u64seq) (u64seq) #define __ulseq_to_u64seq(rb, ulseq) (ulseq) #define ULSEQ_MAX(rb) (-1) #else /* CONFIG_64BIT */ #define __u64seq_to_ulseq(u64seq) ((u32)u64seq) #define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL) static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) { u64 rb_first_seq = prb_first_seq(rb); u64 seq; /* * The provided sequence is only the lower 32 bits of the ringbuffer * sequence. It needs to be expanded to 64bit. Get the first sequence * number from the ringbuffer and fold it. * * Having a 32bit representation in the console is sufficient. * If a console ever gets more than 2^31 records behind * the ringbuffer then this is the least of the problems. * * Also the access to the ring buffer is always safe. */ seq = rb_first_seq - (s32)((u32)rb_first_seq - ulseq); return seq; } #endif /* CONFIG_64BIT */ #endif /* _KERNEL_PRINTK_RINGBUFFER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> /* * 32 bit structures for IA32 support. */ #include <uapi/asm/sigcontext.h> /* signal.h */ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; /* This matches struct stat64 in glibc2.2, hence the absolutely * insane amounts of padding around dev_t's. */ struct stat64 { unsigned long long st_dev; unsigned char __pad0[4]; #define STAT64_HAS_BROKEN_ST_INO 1 unsigned int __st_ino; unsigned int st_mode; unsigned int st_nlink; unsigned int st_uid; unsigned int st_gid; unsigned long long st_rdev; unsigned char __pad3[4]; long long st_size; unsigned int st_blksize; long long st_blocks;/* Number 512-byte blocks allocated */ unsigned st_atime; unsigned st_atime_nsec; unsigned st_mtime; unsigned st_mtime_nsec; unsigned st_ctime; unsigned st_ctime_nsec; unsigned long long st_ino; } __attribute__((packed)); extern bool __ia32_enabled; static __always_inline bool ia32_enabled(void) { return __ia32_enabled; } static inline void ia32_disable(void) { __ia32_enabled = false; } #else /* !CONFIG_IA32_EMULATION */ static __always_inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } static inline void ia32_disable(void) {} #endif static inline bool ia32_enabled_verbose(void) { bool enabled = ia32_enabled(); if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n"); return enabled; } #endif /* _ASM_X86_IA32_H */
1694 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
292 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 // SPDX-License-Identifier: GPL-2.0-or-later /* Paravirtualization interfaces Copyright (C) 2006 Rusty Russell IBM Corporation 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ #include <linux/errno.h> #include <linux/init.h> #include <linux/export.h> #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> #include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> #include <asm/delay.h> #include <asm/fixmap.h> #include <asm/apic.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/special_insns.h> #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } #ifdef CONFIG_PARAVIRT_XXL DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_enable(&virt_spin_lock_key); } static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { return 0; } DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)) { static_call_update(pv_sched_clock, func); } /* These are in entry.S */ static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, .name = "paravirt-ioport", .flags = IORESOURCE_IO | IORESOURCE_BUSY, }; /* * Reserve the whole legacy IO space to prevent any legacy drivers * from wasting time probing for their hardware. This is a fairly * brute-force approach to disabling all non-virtual drivers. * * Note that this must be called very early to have any effect. */ int paravirt_disable_iospace(void) { return request_resource(&ioport_resource, &reserve_ioports); } #ifdef CONFIG_PARAVIRT_XXL static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); } static noinstr unsigned long pv_native_get_debugreg(int regno) { return native_get_debugreg(regno); } static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { native_set_debugreg(regno, val); } noinstr void pv_native_wbinvd(void) { native_wbinvd(); } static noinstr void pv_native_safe_halt(void) { native_safe_halt(); } #endif struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL .extra_user_64bit_cs = __USER_CS, #endif }; /* 64-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { /* Cpu ops. */ .cpu.io_delay = native_io_delay, #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, .cpu.get_debugreg = pv_native_get_debugreg, .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, .cpu.write_msr_safe = native_write_msr_safe, .cpu.read_pmc = native_read_pmc, .cpu.load_tr_desc = native_load_tr_desc, .cpu.set_ldt = native_set_ldt, .cpu.load_gdt = native_load_gdt, .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, .cpu.load_gs_index = native_load_gs_index, .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, .cpu.alloc_ldt = paravirt_nop, .cpu.free_ldt = paravirt_nop, .cpu.load_sp0 = native_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, .mmu.alloc_pte = paravirt_nop, .mmu.alloc_pmd = paravirt_nop, .mmu.alloc_pud = paravirt_nop, .mmu.alloc_p4d = paravirt_nop, .mmu.release_pte = paravirt_nop, .mmu.release_pmd = paravirt_nop, .mmu.release_pud = paravirt_nop, .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, .mmu.set_p4d = native_set_p4d, #if CONFIG_PGTABLE_LEVELS >= 5 .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, .mmu.make_pte = PTE_IDENT, .mmu.make_pgd = PTE_IDENT, .mmu.enter_mmap = paravirt_nop, .mmu.lazy_mode = { .enter = paravirt_nop, .leave = paravirt_nop, .flush = paravirt_nop, }, .mmu.set_fixmap = native_set_fixmap, #endif /* CONFIG_PARAVIRT_XXL */ #if defined(CONFIG_PARAVIRT_SPINLOCKS) /* Lock ops. */ #ifdef CONFIG_SMP .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .lock.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .lock.wait = paravirt_nop, .lock.kick = paravirt_nop, .lock.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ #endif }; #ifdef CONFIG_PARAVIRT_XXL NOKPROBE_SYMBOL(native_load_idt); #endif EXPORT_SYMBOL(pv_ops); EXPORT_SYMBOL_GPL(pv_info);
2 2 2 2 2 1 1 1 4 4 4 3 3 2 2 2 2 3 3 3 1 1 1 1 5 4 5 3 3 3 4 4 1 1 1 4 3 2 1 4 4 3 2 1 4 60 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 // SPDX-License-Identifier: GPL-2.0-or-later /***************************************************************************** * Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoL2TP --- PPP over L2TP (RFC 2661) * * Version: 2.0.0 * * Authors: James Chapman (jchapman@katalix.com) * * Based on original work by Martijn van Oosterhout <kleptog@svana.org> * * License: */ /* This driver handles only L2TP data frames; control frames are handled by a * userspace application. * * To send data in an L2TP session, userspace opens a PPPoL2TP socket and * attaches it to a bound UDP socket with local tunnel_id / session_id and * peer tunnel_id / session_id set. Data can then be sent or received using * regular socket sendmsg() / recvmsg() calls. Kernel parameters of the socket * can be read or modified using ioctl() or [gs]etsockopt() calls. * * When a PPPoL2TP socket is connected with local and peer session_id values * zero, the socket is treated as a special tunnel management socket. * * Here's example userspace code to create a socket for sending/receiving data * over an L2TP session:- * * struct sockaddr_pppol2tp sax; * int fd; * int session_fd; * * fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP); * * sax.sa_family = AF_PPPOX; * sax.sa_protocol = PX_PROTO_OL2TP; * sax.pppol2tp.fd = tunnel_fd; // bound UDP socket * sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr; * sax.pppol2tp.addr.sin_port = addr->sin_port; * sax.pppol2tp.addr.sin_family = AF_INET; * sax.pppol2tp.s_tunnel = tunnel_id; * sax.pppol2tp.s_session = session_id; * sax.pppol2tp.d_tunnel = peer_tunnel_id; * sax.pppol2tp.d_session = peer_session_id; * * session_fd = connect(fd, (struct sockaddr *)&sax, sizeof(sax)); * * A pppd plugin that allows PPP traffic to be carried over L2TP using * this driver is available from the OpenL2TP project at * http://openl2tp.sourceforge.net. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/if_pppox.h> #include <linux/if_pppol2tp.h> #include <net/sock.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/file.h> #include <linux/hash.h> #include <linux/sort.h> #include <linux/proc_fs.h> #include <linux/l2tp.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip.h> #include <net/udp.h> #include <net/inet_common.h> #include <asm/byteorder.h> #include <linux/atomic.h> #include "l2tp_core.h" #define PPPOL2TP_DRV_VERSION "V2.0" /* Space for UDP, L2TP and PPP headers */ #define PPPOL2TP_HEADER_OVERHEAD 40 /* Number of bytes to build transmit L2TP headers. * Unfortunately the size is different depending on whether sequence numbers * are enabled. */ #define PPPOL2TP_L2TP_HDR_SIZE_SEQ 10 #define PPPOL2TP_L2TP_HDR_SIZE_NOSEQ 6 /* Private data of each session. This data lives at the end of struct * l2tp_session, referenced via session->priv[]. */ struct pppol2tp_session { int owner; /* pid that opened the socket */ struct mutex sk_lock; /* Protects .sk */ struct sock __rcu *sk; /* Pointer to the session PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ }; static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); static const struct ppp_channel_ops pppol2tp_chan_ops = { .start_xmit = pppol2tp_xmit, }; static const struct proto_ops pppol2tp_ops; /* Retrieves the pppol2tp socket associated to a session. * A reference is held on the returned socket, so this function must be paired * with sock_put(). */ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); struct sock *sk; rcu_read_lock(); sk = rcu_dereference(ps->sk); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } /* Helpers to obtain tunnel/session contexts from sockets. */ static struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) { struct l2tp_session *session; if (!sk) return NULL; rcu_read_lock(); session = rcu_dereference_sk_user_data(sk); if (session && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock(); WARN_ON_ONCE(session->magic != L2TP_SESSION_MAGIC); return session; } rcu_read_unlock(); return NULL; } /***************************************************************************** * Receive data handling *****************************************************************************/ /* Receive message. This is the recvmsg for the PPPoL2TP socket. */ static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { int err; struct sk_buff *skb; struct sock *sk = sock->sk; err = -EIO; if (sk->sk_state & PPPOX_BOUND) goto end; err = 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto end; if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; kfree_skb(skb); end: return err; } static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { struct pppol2tp_session *ps = l2tp_session_priv(session); struct sock *sk = NULL; /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ rcu_read_lock(); sk = rcu_dereference(ps->sk); if (!sk) goto no_sock; /* If the first two bytes are 0xFF03, consider that it is the PPP's * Address and Control fields and skip them. The L2TP module has always * worked this way, although, in theory, the use of these fields should * be negotiated and handled at the PPP layer. These fields are * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered * Information command with Poll/Final bit set to zero (RFC 1662). */ if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS && skb->data[1] == PPP_UI) skb_pull(skb, 2); if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { if (sock_queue_rcv_skb(sk, skb) < 0) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } } rcu_read_unlock(); return; no_sock: rcu_read_unlock(); pr_warn_ratelimited("%s: no socket in recv\n", session->name); kfree_skb(skb); } /************************************************************************ * Transmit handling ***********************************************************************/ /* This is the sendmsg for the PPPoL2TP pppol2tp_session socket. We come here * when a user application does a sendmsg() on the session socket. L2TP and * PPP headers must be inserted into the user's data. */ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; struct sk_buff *skb; int error; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen; error = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto error; /* Get session and tunnel contexts */ error = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto error; tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; /* Allocate a socket buffer */ error = -ENOMEM; skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len + 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) goto error_put_sess; /* Reserve space for headers. */ skb_reserve(skb, NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); skb_reserve(skb, uhlen); /* Add PPP header */ skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; skb_put(skb, 2); /* Copy user data into skb */ error = memcpy_from_msg(skb_put(skb, total_len), m, total_len); if (error < 0) { kfree_skb(skb); goto error_put_sess; } local_bh_disable(); l2tp_xmit_skb(session, skb); local_bh_enable(); l2tp_session_put(session); return total_len; error_put_sess: l2tp_session_put(session); error: return error; } /* Transmit function called by generic PPP driver. Sends PPP frame * over PPPoL2TP socket. * * This is almost the same as pppol2tp_sendmsg(), but rather than * being called with a msghdr from userspace, it is called with a skb * from the kernel. * * The supplied skb from ppp doesn't have enough headroom for the * insertion of L2TP, UDP and IP headers so we need to allocate more * headroom in the skb. This will create a cloned skb. But we must be * careful in the error case because the caller will expect to free * the skb it supplied, not our cloned skb. So we take care to always * leave the original skb unfreed if we return an error. */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *)chan->private; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen, headroom; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; /* Get session and tunnel contexts from the socket */ session = pppol2tp_sock_to_session(sk); if (!session) goto abort; tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; headroom = NET_SKB_PAD + sizeof(struct iphdr) + /* IP header */ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ session->hdr_len + /* L2TP header */ 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) goto abort_put_sess; /* Setup PPP header */ __skb_push(skb, 2); skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; local_bh_disable(); l2tp_xmit_skb(session, skb); local_bh_enable(); l2tp_session_put(session); return 1; abort_put_sess: l2tp_session_put(session); abort: /* Free the original skb */ kfree_skb(skb); return 1; } /***************************************************************************** * Session (and tunnel control) socket create/destroy. *****************************************************************************/ /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) */ static void pppol2tp_session_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void pppol2tp_session_close(struct l2tp_session *session) { struct pppol2tp_session *ps; ps = l2tp_session_priv(session); mutex_lock(&ps->sk_lock); ps->__sk = rcu_dereference_protected(ps->sk, lockdep_is_held(&ps->sk_lock)); RCU_INIT_POINTER(ps->sk, NULL); mutex_unlock(&ps->sk_lock); if (ps->__sk) { /* detach socket */ rcu_assign_sk_user_data(ps->__sk, NULL); sock_put(ps->__sk); /* drop ref taken when we referenced socket via sk_user_data */ l2tp_session_put(session); } } /* Called when the PPPoX socket (session) is closed. */ static int pppol2tp_release(struct socket *sock) { struct sock *sk = sock->sk; struct l2tp_session *session; int error; if (!sk) return 0; error = -EBADF; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) != 0) goto error; pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; sock_orphan(sk); sock->sk = NULL; session = pppol2tp_sock_to_session(sk); if (session) { l2tp_session_delete(session); /* drop ref taken by pppol2tp_sock_to_session */ l2tp_session_put(session); } release_sock(sk); sock_put(sk); return 0; error: release_sock(sk); return error; } static struct proto pppol2tp_sk_proto = { .name = "PPPOL2TP", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; rc = l2tp_udp_encap_recv(sk, skb); if (rc) kfree_skb(skb); return NET_RX_SUCCESS; } /* socket() handler. Initialize a new struct sock. */ static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; sock_init_data(sock, sk); sock_set_flag(sk, SOCK_RCU_FREE); sock->state = SS_UNCONNECTED; sock->ops = &pppol2tp_ops; sk->sk_backlog_rcv = pppol2tp_backlog_recv; sk->sk_protocol = PX_PROTO_OL2TP; sk->sk_family = PF_PPPOX; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_destruct = pppol2tp_session_destruct; error = 0; out: return error; } static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; struct sock *sk; sk = pppol2tp_session_get_sock(session); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); sock_put(sk); } } static void pppol2tp_session_init(struct l2tp_session *session) { struct pppol2tp_session *ps; session->recv_skb = pppol2tp_recv; session->session_close = pppol2tp_session_close; if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) session->show = pppol2tp_show; ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); ps->owner = current->pid; } struct l2tp_connect_info { u8 version; int fd; u32 tunnel_id; u32 peer_tunnel_id; u32 session_id; u32 peer_session_id; }; static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, struct l2tp_connect_info *info) { switch (sa_len) { case sizeof(struct sockaddr_pppol2tp): { const struct sockaddr_pppol2tp *sa_v2in4 = sa; if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 2; info->fd = sa_v2in4->pppol2tp.fd; info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel; info->session_id = sa_v2in4->pppol2tp.s_session; info->peer_session_id = sa_v2in4->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpv3): { const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa; if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 3; info->fd = sa_v3in4->pppol2tp.fd; info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel; info->session_id = sa_v3in4->pppol2tp.s_session; info->peer_session_id = sa_v3in4->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpin6): { const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa; if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 2; info->fd = sa_v2in6->pppol2tp.fd; info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel; info->session_id = sa_v2in6->pppol2tp.s_session; info->peer_session_id = sa_v2in6->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpv3in6): { const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa; if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 3; info->fd = sa_v3in6->pppol2tp.fd; info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel; info->session_id = sa_v3in6->pppol2tp.s_session; info->peer_session_id = sa_v3in6->pppol2tp.d_session; break; } default: return -EINVAL; } return 0; } /* Rough estimation of the maximum payload size a tunnel can transmit without * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence * numbers and no IP option. Not quite accurate, but the result is mostly * unused anyway. */ static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel) { int mtu; mtu = l2tp_tunnel_dst_mtu(tunnel); if (mtu <= PPPOL2TP_HEADER_OVERHEAD) return 1500 - PPPOL2TP_HEADER_OVERHEAD; return mtu - PPPOL2TP_HEADER_OVERHEAD; } static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, const struct l2tp_connect_info *info, bool *new_tunnel) { struct l2tp_tunnel *tunnel; int error; *new_tunnel = false; tunnel = l2tp_tunnel_get(net, info->tunnel_id); /* Special case: create tunnel context if session_id and * peer_session_id is 0. Otherwise look up tunnel using supplied * tunnel id. */ if (!info->session_id && !info->peer_session_id) { if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, }; /* Prevent l2tp_tunnel_register() from trying to set up * a kernel socket. */ if (info->fd < 0) return ERR_PTR(-EBADF); error = l2tp_tunnel_create(info->fd, info->version, info->tunnel_id, info->peer_tunnel_id, &tcfg, &tunnel); if (error < 0) return ERR_PTR(error); refcount_inc(&tunnel->ref_count); error = l2tp_tunnel_register(tunnel, net, &tcfg); if (error < 0) { kfree(tunnel); return ERR_PTR(error); } *new_tunnel = true; } } else { /* Error if we can't find the tunnel */ if (!tunnel) return ERR_PTR(-ENOENT); /* Error if socket is not prepped */ if (!tunnel->sock) { l2tp_tunnel_put(tunnel); return ERR_PTR(-ENOENT); } } return tunnel; } /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); struct l2tp_session *session = NULL; struct l2tp_connect_info info; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; struct l2tp_session_cfg cfg = { 0, }; bool drop_refcnt = false; bool new_session = false; bool new_tunnel = false; int error; error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info); if (error < 0) return error; /* Don't bind if tunnel_id is 0 */ if (!info.tunnel_id) return -EINVAL; tunnel = pppol2tp_tunnel_get(sock_net(sk), &info, &new_tunnel); if (IS_ERR(tunnel)) return PTR_ERR(tunnel); lock_sock(sk); /* Check for already bound sockets */ error = -EBUSY; if (sk->sk_state & PPPOX_CONNECTED) goto end; /* We don't supporting rebinding anyway */ error = -EALREADY; if (sk->sk_user_data) goto end; /* socket is already attached */ if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = info.peer_tunnel_id; session = l2tp_session_get(sock_net(sk), tunnel->sock, tunnel->version, info.tunnel_id, info.session_id); if (session) { drop_refcnt = true; if (session->pwtype != L2TP_PWTYPE_PPP) { error = -EPROTOTYPE; goto end; } ps = l2tp_session_priv(session); /* Using a pre-existing session is fine as long as it hasn't * been connected yet. */ mutex_lock(&ps->sk_lock); if (rcu_dereference_protected(ps->sk, lockdep_is_held(&ps->sk_lock)) || ps->__sk) { mutex_unlock(&ps->sk_lock); error = -EEXIST; goto end; } } else { cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, info.session_id, info.peer_session_id, &cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto end; } drop_refcnt = true; pppol2tp_session_init(session); ps = l2tp_session_priv(session); refcount_inc(&session->ref_count); mutex_lock(&ps->sk_lock); error = l2tp_session_register(session, tunnel); if (error < 0) { mutex_unlock(&ps->sk_lock); l2tp_session_put(session); goto end; } new_session = true; } /* Special case: if source & dest session_id == 0x0000, this * socket is being created to manage the tunnel. Just set up * the internal context for use by ioctl() and sockopt() * handlers. */ if (session->session_id == 0 && session->peer_session_id == 0) { error = 0; goto out_no_ppp; } /* The only header we need to worry about is the L2TP * header. This size is different depending on whether * sequence numbers are enabled for the data channel. */ po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; po->chan.private = sk; po->chan.ops = &pppol2tp_chan_ops; po->chan.mtu = pppol2tp_tunnel_mtu(tunnel); error = ppp_register_net_channel(sock_net(sk), &po->chan); if (error) { mutex_unlock(&ps->sk_lock); goto end; } out_no_ppp: /* This is how we get the session context from the socket. */ sock_hold(sk); rcu_assign_sk_user_data(sk, session); rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); /* Keep the reference we've grabbed on the session: sk doesn't expect * the session to disappear. pppol2tp_session_close() is responsible * for dropping it. */ drop_refcnt = false; sk->sk_state = PPPOX_CONNECTED; end: if (error) { if (new_session) l2tp_session_delete(session); if (new_tunnel) l2tp_tunnel_delete(tunnel); } if (drop_refcnt) l2tp_session_put(session); l2tp_tunnel_put(tunnel); release_sock(sk); return error; } #ifdef CONFIG_L2TP_V3 /* Called when creating sessions via the netlink interface. */ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { int error; struct l2tp_session *session; /* Error if tunnel socket is not prepped */ if (!tunnel->sock) { error = -ENOENT; goto err; } /* Allocate and initialize a new session context. */ session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, peer_session_id, cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto err; } pppol2tp_session_init(session); error = l2tp_session_register(session, tunnel); if (error < 0) goto err_sess; return 0; err_sess: l2tp_session_put(session); err: return error; } #endif /* CONFIG_L2TP_V3 */ /* getname() support. */ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = 0; int error = 0; struct l2tp_session *session; struct l2tp_tunnel *tunnel; struct sock *sk = sock->sk; struct inet_sock *inet; struct pppol2tp_session *pls; error = -ENOTCONN; if (!sk) goto end; if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; pls = l2tp_session_priv(session); tunnel = session->tunnel; inet = inet_sk(tunnel->sock); if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET) { struct sockaddr_pppol2tp sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin_family = AF_INET; sp.pppol2tp.addr.sin_port = inet->inet_dport; sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); #if IS_ENABLED(CONFIG_IPV6) } else if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpin6 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin6_family = AF_INET6; sp.pppol2tp.addr.sin6_port = inet->inet_dport; memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); } else if (tunnel->version == 3 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpv3in6 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin6_family = AF_INET6; sp.pppol2tp.addr.sin6_port = inet->inet_dport; memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); #endif } else if (tunnel->version == 3) { struct sockaddr_pppol2tpv3 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin_family = AF_INET; sp.pppol2tp.addr.sin_port = inet->inet_dport; sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); } error = len; l2tp_session_put(session); end: return error; } /**************************************************************************** * ioctl() handlers. * * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP * sockets. However, in order to control kernel tunnel features, we allow * userspace to create a special "tunnel" PPPoX socket which is used for * control only. Tunnel PPPoX sockets have session_id == 0 and simply allow * the user application to issue L2TP setsockopt(), getsockopt() and ioctl() * calls. ****************************************************************************/ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, const struct l2tp_stats *stats) { memset(dest, 0, sizeof(*dest)); dest->tx_packets = atomic_long_read(&stats->tx_packets); dest->tx_bytes = atomic_long_read(&stats->tx_bytes); dest->tx_errors = atomic_long_read(&stats->tx_errors); dest->rx_packets = atomic_long_read(&stats->rx_packets); dest->rx_bytes = atomic_long_read(&stats->rx_bytes); dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards); dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets); dest->rx_errors = atomic_long_read(&stats->rx_errors); } static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, struct l2tp_tunnel *tunnel) { struct l2tp_session *session; if (!stats->session_id) { pppol2tp_copy_stats(stats, &tunnel->stats); return 0; } /* If session_id is set, search the corresponding session in the * context of this tunnel and record the session's statistics. */ session = l2tp_session_get(tunnel->l2tp_net, tunnel->sock, tunnel->version, tunnel->tunnel_id, stats->session_id); if (!session) return -EBADR; if (session->pwtype != L2TP_PWTYPE_PPP) { l2tp_session_put(session); return -EBADR; } pppol2tp_copy_stats(stats, &session->stats); l2tp_session_put(session); return 0; } static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct pppol2tp_ioc_stats stats; struct l2tp_session *session; switch (cmd) { case PPPIOCGMRU: case PPPIOCGFLAGS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; if (put_user(0, (int __user *)arg)) return -EFAULT; break; case PPPIOCSMRU: case PPPIOCSFLAGS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; if (!access_ok((int __user *)arg, sizeof(int))) return -EFAULT; break; case PPPIOCGL2TPSTATS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Session 0 represents the parent tunnel */ if (!session->session_id && !session->peer_session_id) { u32 session_id; int err; if (copy_from_user(&stats, (void __user *)arg, sizeof(stats))) return -EFAULT; session_id = stats.session_id; err = pppol2tp_tunnel_copy_stats(&stats, session->tunnel); if (err < 0) return err; stats.session_id = session_id; } else { pppol2tp_copy_stats(&stats, &session->stats); stats.session_id = session->session_id; } stats.tunnel_id = session->tunnel->tunnel_id; stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel); if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) return -EFAULT; break; default: return -ENOIOCTLCMD; } return 0; } /***************************************************************************** * setsockopt() / getsockopt() support. * * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP * sockets. In order to control kernel tunnel features, we allow userspace to * create a special "tunnel" PPPoX socket which is used for control only. * Tunnel PPPoX sockets have session_id == 0 and simply allow the user * application to issue L2TP setsockopt(), getsockopt() and ioctl() calls. *****************************************************************************/ /* Tunnel setsockopt() helper. */ static int pppol2tp_tunnel_setsockopt(struct sock *sk, struct l2tp_tunnel *tunnel, int optname, int val) { int err = 0; switch (optname) { case PPPOL2TP_SO_DEBUG: /* Tunnel debug flags option is deprecated */ break; default: err = -ENOPROTOOPT; break; } return err; } /* Session setsockopt helper. */ static int pppol2tp_session_setsockopt(struct sock *sk, struct l2tp_session *session, int optname, int val) { int err = 0; switch (optname) { case PPPOL2TP_SO_RECVSEQ: if (val != 0 && val != 1) { err = -EINVAL; break; } session->recv_seq = !!val; break; case PPPOL2TP_SO_SENDSEQ: if (val != 0 && val != 1) { err = -EINVAL; break; } session->send_seq = !!val; { struct pppox_sock *po = pppox_sk(sk); po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } l2tp_session_set_header_len(session, session->tunnel->version, session->tunnel->encap); break; case PPPOL2TP_SO_LNSMODE: if (val != 0 && val != 1) { err = -EINVAL; break; } session->lns_mode = !!val; break; case PPPOL2TP_SO_DEBUG: /* Session debug flags option is deprecated */ break; case PPPOL2TP_SO_REORDERTO: session->reorder_timeout = msecs_to_jiffies(val); break; default: err = -ENOPROTOOPT; break; } return err; } /* Main setsockopt() entry point. * Does API checks, then calls either the tunnel or session setsockopt * handler, according to whether the PPPoL2TP socket is a for a regular * session or the special tunnel type. */ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int val; int err; if (level != SOL_PPPOL2TP) return -EINVAL; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; err = -ENOTCONN; if (!sk->sk_user_data) goto end; /* Get session context from the socket */ err = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); } else { err = pppol2tp_session_setsockopt(sk, session, optname, val); } l2tp_session_put(session); end: return err; } /* Tunnel getsockopt helper. Called with sock locked. */ static int pppol2tp_tunnel_getsockopt(struct sock *sk, struct l2tp_tunnel *tunnel, int optname, int *val) { int err = 0; switch (optname) { case PPPOL2TP_SO_DEBUG: /* Tunnel debug flags option is deprecated */ *val = 0; break; default: err = -ENOPROTOOPT; break; } return err; } /* Session getsockopt helper. Called with sock locked. */ static int pppol2tp_session_getsockopt(struct sock *sk, struct l2tp_session *session, int optname, int *val) { int err = 0; switch (optname) { case PPPOL2TP_SO_RECVSEQ: *val = session->recv_seq; break; case PPPOL2TP_SO_SENDSEQ: *val = session->send_seq; break; case PPPOL2TP_SO_LNSMODE: *val = session->lns_mode; break; case PPPOL2TP_SO_DEBUG: /* Session debug flags option is deprecated */ *val = 0; break; case PPPOL2TP_SO_REORDERTO: *val = (int)jiffies_to_msecs(session->reorder_timeout); break; default: err = -ENOPROTOOPT; } return err; } /* Main getsockopt() entry point. * Does API checks, then calls either the tunnel or session getsockopt * handler, according to whether the PPPoX socket is a for a regular session * or the special tunnel type. */ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int val, len; int err; if (level != SOL_PPPOL2TP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); err = -ENOTCONN; if (!sk->sk_user_data) goto end; /* Get the session context */ err = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); if (err) goto end_put_sess; } else { err = pppol2tp_session_getsockopt(sk, session, optname, &val); if (err) goto end_put_sess; } err = -EFAULT; if (put_user(len, optlen)) goto end_put_sess; if (copy_to_user((void __user *)optval, &val, len)) goto end_put_sess; err = 0; end_put_sess: l2tp_session_put(session); end: return err; } /***************************************************************************** * /proc filesystem for debug * Since the original pppol2tp driver provided /proc/net/pppol2tp for * L2TPv2, we dump only L2TPv2 tunnels and sessions here. *****************************************************************************/ #ifdef CONFIG_PROC_FS struct pppol2tp_seq_data { struct seq_net_private p; unsigned long tkey; /* lookup key of current tunnel */ unsigned long skey; /* lookup key of current session */ struct l2tp_tunnel *tunnel; struct l2tp_session *session; /* NULL means get next tunnel */ }; static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->tunnel) l2tp_tunnel_put(pd->tunnel); for (;;) { pd->tunnel = l2tp_tunnel_get_next(net, &pd->tkey); pd->tkey++; /* Only accept L2TPv2 tunnels */ if (!pd->tunnel || pd->tunnel->version == 2) return; l2tp_tunnel_put(pd->tunnel); } } static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->session) l2tp_session_put(pd->session); pd->session = l2tp_session_get_next(net, pd->tunnel->sock, pd->tunnel->version, pd->tunnel->tunnel_id, &pd->skey); pd->skey++; if (!pd->session) { pd->skey = 0; pppol2tp_next_tunnel(net, pd); } } static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs) { struct pppol2tp_seq_data *pd = SEQ_START_TOKEN; loff_t pos = *offs; struct net *net; if (!pos) goto out; if (WARN_ON(!m->private)) { pd = NULL; goto out; } pd = m->private; net = seq_file_net(m); if (!pd->tunnel) pppol2tp_next_tunnel(net, pd); else pppol2tp_next_session(net, pd); /* NULL tunnel and session indicates end of list */ if (!pd->tunnel && !pd->session) pd = NULL; out: return pd; } static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return NULL; } static void pppol2tp_seq_stop(struct seq_file *p, void *v) { struct pppol2tp_seq_data *pd = v; if (!pd || pd == SEQ_START_TOKEN) return; /* Drop reference taken by last invocation of pppol2tp_next_session() * or pppol2tp_next_tunnel(). */ if (pd->session) { l2tp_session_put(pd->session); pd->session = NULL; } if (pd->tunnel) { l2tp_tunnel_put(pd->tunnel); pd->tunnel = NULL; } } static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) { struct l2tp_tunnel *tunnel = v; seq_printf(m, "\nTUNNEL '%s', %c %d\n", tunnel->name, tunnel->sock ? 'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), atomic_long_read(&tunnel->stats.rx_packets), atomic_long_read(&tunnel->stats.rx_bytes), atomic_long_read(&tunnel->stats.rx_errors)); } static void pppol2tp_seq_session_show(struct seq_file *m, void *v) { struct l2tp_session *session = v; struct l2tp_tunnel *tunnel = session->tunnel; unsigned char state; char user_data_ok; struct sock *sk; u32 ip = 0; u16 port = 0; if (tunnel->sock) { struct inet_sock *inet = inet_sk(tunnel->sock); ip = ntohl(inet->inet_saddr); port = ntohs(inet->inet_sport); } sk = pppol2tp_session_get_sock(session); if (sk) { state = sk->sk_state; user_data_ok = (session == sk->sk_user_data) ? 'Y' : 'N'; } else { state = 0; user_data_ok = 'N'; } seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> %04X/%04X %d %c\n", session->name, ip, port, tunnel->tunnel_id, session->session_id, tunnel->peer_tunnel_id, session->peer_session_id, state, user_data_ok); seq_printf(m, " 0/0/%c/%c/%s %08x %u\n", session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " %u/%u %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, atomic_long_read(&session->stats.tx_packets), atomic_long_read(&session->stats.tx_bytes), atomic_long_read(&session->stats.tx_errors), atomic_long_read(&session->stats.rx_packets), atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); sock_put(sk); } } static int pppol2tp_seq_show(struct seq_file *m, void *v) { struct pppol2tp_seq_data *pd = v; /* display header on line 1 */ if (v == SEQ_START_TOKEN) { seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n"); seq_puts(m, "TUNNEL name, user-data-ok session-count\n"); seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); seq_puts(m, " SESSION name, addr/port src-tid/sid dest-tid/sid state user-data-ok\n"); seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n"); seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); goto out; } if (!pd->session) pppol2tp_seq_tunnel_show(m, pd->tunnel); else pppol2tp_seq_session_show(m, pd->session); out: return 0; } static const struct seq_operations pppol2tp_seq_ops = { .start = pppol2tp_seq_start, .next = pppol2tp_seq_next, .stop = pppol2tp_seq_stop, .show = pppol2tp_seq_show, }; #endif /* CONFIG_PROC_FS */ /***************************************************************************** * Network namespace *****************************************************************************/ static __net_init int pppol2tp_init_net(struct net *net) { struct proc_dir_entry *pde; int err = 0; pde = proc_create_net("pppol2tp", 0444, net->proc_net, &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); if (!pde) { err = -ENOMEM; goto out; } out: return err; } static __net_exit void pppol2tp_exit_net(struct net *net) { remove_proc_entry("pppol2tp", net->proc_net); } static struct pernet_operations pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, }; /***************************************************************************** * Init and cleanup *****************************************************************************/ static const struct proto_ops pppol2tp_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppol2tp_release, .bind = sock_no_bind, .connect = pppol2tp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, .getsockopt = pppol2tp_getsockopt, .sendmsg = pppol2tp_sendmsg, .recvmsg = pppol2tp_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppol2tp_proto = { .create = pppol2tp_create, .ioctl = pppol2tp_ioctl, .owner = THIS_MODULE, }; #ifdef CONFIG_L2TP_V3 static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = { .session_create = pppol2tp_session_create, .session_delete = l2tp_session_delete, }; #endif /* CONFIG_L2TP_V3 */ static int __init pppol2tp_init(void) { int err; err = register_pernet_device(&pppol2tp_net_ops); if (err) goto out; err = proto_register(&pppol2tp_sk_proto, 0); if (err) goto out_unregister_pppol2tp_pernet; err = register_pppox_proto(PX_PROTO_OL2TP, &pppol2tp_proto); if (err) goto out_unregister_pppol2tp_proto; #ifdef CONFIG_L2TP_V3 err = l2tp_nl_register_ops(L2TP_PWTYPE_PPP, &pppol2tp_nl_cmd_ops); if (err) goto out_unregister_pppox; #endif pr_info("PPPoL2TP kernel driver, %s\n", PPPOL2TP_DRV_VERSION); out: return err; #ifdef CONFIG_L2TP_V3 out_unregister_pppox: unregister_pppox_proto(PX_PROTO_OL2TP); #endif out_unregister_pppol2tp_proto: proto_unregister(&pppol2tp_sk_proto); out_unregister_pppol2tp_pernet: unregister_pernet_device(&pppol2tp_net_ops); goto out; } static void __exit pppol2tp_exit(void) { #ifdef CONFIG_L2TP_V3 l2tp_nl_unregister_ops(L2TP_PWTYPE_PPP); #endif unregister_pppox_proto(PX_PROTO_OL2TP); proto_unregister(&pppol2tp_sk_proto); unregister_pernet_device(&pppol2tp_net_ops); } module_init(pppol2tp_init); module_exit(pppol2tp_exit); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP); MODULE_ALIAS_L2TP_PWTYPE(7);
4 3 1 4598 321 137 137 137 92 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_XSTATE_H #define __X86_KERNEL_FPU_XSTATE_H #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); #endif static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) { /* * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } static inline u64 xstate_get_group_perm(bool guest) { struct fpu *fpu = &current->group_leader->thread.fpu; struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ perm = guest ? &fpu->guest_perm : &fpu->perm; return READ_ONCE(perm->__state_perm); } static inline u64 xstate_get_host_group_perm(void) { return xstate_get_group_perm(false); } enum xstate_copy_mode { XSTATE_COPY_FP, XSTATE_COPY_FX, XSTATE_COPY_XSAVE, }; struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); static inline u64 xfeatures_mask_supervisor(void) { return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; return fpu_kernel_cfg.independent_features; } /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " #else #define REX_PREFIX #endif /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" #define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" /* * After this @err contains 0 on success or the trap number when the * operation raises an exception. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor * states in addition to XSAVEC. * * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * * Use XSAVE as a fallback. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ asm volatile("1: " ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ "3:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile("1: " ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") #if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); #else static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } #endif #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { wrmsrl(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; if (__this_cpu_read(xfd_state) != xfd) xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else static inline void xfd_set_state(u64 xfd) { } static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { return -EPERM; } #endif /* * Save processor xstate to xsave area. * * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features * and command line options. The choice is permanent until the next reboot. */ static inline void os_xsave(struct fpstate *fpstate) { u64 mask = fpstate->xfeatures; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); xfd_validate_state(fpstate, mask, false); XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. * * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ static inline void os_xrstor(struct fpstate *fpstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; xfd_validate_state(fpstate, mask, true); XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* Restore of supervisor state. Does not require XFD */ static inline void os_xrstor_supervisor(struct fpstate *fpstate) { u64 mask = xfeatures_mask_supervisor(); u32 lmask = mask; u32 hmask = mask >> 32; XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); } /* * XSAVE itself always writes all requested xfeatures. Removing features * from the request bitmap reduces the features which are written. * Generate a mask of features which must be written to a sigframe. The * unset features can be optimized away and not written. * * This optimization is user-visible. Only use for states where * uninitialized sigframe contents are tolerable, like dynamic features. * * Users of buffers produced with this optimization must check XSTATE_BV * to determine which features have been optimized out. */ static inline u64 xfeatures_need_sigframe_write(void) { u64 xfeaures_to_write; /* In-use features must be written: */ xfeaures_to_write = xfeatures_in_use(); /* Also write all non-optimizable sigframe features: */ xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_SIGFRAME_INITOPT; return xfeaures_to_write; } /* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track * a different application. * * We don't use compacted format xsave area for backward compatibility for * old applications which don't understand the compacted format of the * xsave area. * * The caller has to zero buf::header before calling this because XSAVE* * does not touch the reserved fields in the header. */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { /* * Include the features which are not xsaved/rstored by the kernel * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ struct fpstate *fpstate = current->thread.fpu.fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; int err; /* Optimize away writing unnecessary xfeatures: */ if (fpu_state_size_dynamic()) mask &= xfeatures_need_sigframe_write(); lmask = mask; hmask = mask >> 32; xfd_validate_state(fpstate, mask, false); stac(); XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); return err; } /* * Restore xstate from user space xsave area. */ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; int err; xfd_validate_state(current->thread.fpu.fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); return err; } /* * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) { struct xregs_state *xstate = &fpstate->regs.xsave; u32 lmask = mask; u32 hmask = mask >> 32; int err; /* Ensure that XFD is up to date */ xfd_update_state(fpstate); if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } #endif
8 8 8 368 368 49 35 12 35 35 5 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H #include <linux/mm_types.h> #include <linux/mmu_notifier.h> #include <linux/sched.h> #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> #include <asm/smp.h> #include <asm/invpcid.h> #include <asm/pti.h> #include <asm/processor-flags.h> #include <asm/pgtable.h> DECLARE_PER_CPU(u64, tlbstate_untag_mask); void __flush_tlb_all(void); #define TLB_FLUSH_ALL -1UL #define TLB_GENERATION_INVALID 0 void cr4_update_irqsoff(unsigned long set, unsigned long clear); unsigned long cr4_read_shadow(void); /* Set in this cpu's CR4. */ static inline void cr4_set_bits_irqsoff(unsigned long mask) { cr4_update_irqsoff(mask, 0); } /* Clear in this cpu's CR4. */ static inline void cr4_clear_bits_irqsoff(unsigned long mask) { cr4_update_irqsoff(0, mask); } /* Set in this cpu's CR4. */ static inline void cr4_set_bits(unsigned long mask) { unsigned long flags; local_irq_save(flags); cr4_set_bits_irqsoff(mask); local_irq_restore(flags); } /* Clear in this cpu's CR4. */ static inline void cr4_clear_bits(unsigned long mask) { unsigned long flags; local_irq_save(flags); cr4_clear_bits_irqsoff(mask); local_irq_restore(flags); } #ifndef MODULE /* * 6 because 6 should be plenty and struct tlb_state will fit in two cache * lines. */ #define TLB_NR_DYN_ASIDS 6 struct tlb_context { u64 ctx_id; u64 tlb_gen; }; struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts * are on. This means that it may not match current->active_mm, * which will contain the previous user mm when we're in lazy TLB * mode even if we've already switched back to swapper_pg_dir. * * During switch_mm_irqs_off(), loaded_mm will be set to * LOADED_MM_SWITCHING during the brief interrupts-off window * when CR3 and loaded_mm would otherwise be inconsistent. This * is for nmi_uaccess_okay()'s benefit. */ struct mm_struct *loaded_mm; #define LOADED_MM_SWITCHING ((struct mm_struct *)1UL) /* Last user mm for optimizing IBPB */ union { struct mm_struct *last_user_mm; unsigned long last_user_mm_spec; }; u16 loaded_mm_asid; u16 next_asid; /* * If set we changed the page tables in such a way that we * needed an invalidation of all contexts (aka. PCIDs / ASIDs). * This tells us to go invalidate all the non-loaded ctxs[] * on the next context switch. * * The current ctx was kept up-to-date as it ran and does not * need to be invalidated. */ bool invalidate_other; #ifdef CONFIG_ADDRESS_MASKING /* * Active LAM mode. * * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM * disabled. */ u8 lam; #endif /* * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate * the corresponding user PCID needs a flush next time we * switch to it; see SWITCH_TO_USER_CR3. */ unsigned short user_pcid_flush_mask; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; /* * This is a list of all contexts that might exist in the TLB. * There is one per ASID that we use, and the ASID (what the * CPU calls PCID) is the index into ctxts. * * For each context, ctx_id indicates which mm the TLB's user * entries came from. As an invariant, the TLB will never * contain entries that are out-of-date as when that mm reached * the tlb_gen in the list. * * To be clear, this means that it's legal for the TLB code to * flush the TLB without updating tlb_gen. This can happen * (for now, at least) due to paravirt remote flushes. * * NB: context 0 is a bit special, since it's also used by * various bits of init code. This is fine -- code that * isn't aware of PCID will end up harmlessly flushing * context 0. */ struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate); struct tlb_state_shared { /* * We can be in one of several states: * * - Actively using an mm. Our CPU's bit will be set in * mm_cpumask(loaded_mm) and is_lazy == false; * * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit * will not be set in mm_cpumask(&init_mm) and is_lazy == false. * * - Lazily using a real mm. loaded_mm != &init_mm, our bit * is set in mm_cpumask(loaded_mm), but is_lazy == true. * We're heuristically guessing that the CR3 load we * skipped more than makes up for the overhead added by * lazy mode. */ bool is_lazy; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); bool nmi_uaccess_okay(void); #define nmi_uaccess_okay nmi_uaccess_okay /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; extern void initialize_tlbstate_and_flush(void); /* * TLB flushing: * * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages * - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { /* * We support several kinds of flushes. * * - Fully flush a single mm. .mm will be set, .end will be * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to * which the IPI sender is trying to catch us up. * * - Partially flush a single mm. .mm will be set, .start and * .end will indicate the range, and .new_tlb_gen will be set * such that the changes between generation .new_tlb_gen-1 and * .new_tlb_gen are entirely contained in the indicated range. * * - Fully flush all mms whose tlb_gens have been updated. .mm * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen * will be zero. */ struct mm_struct *mm; unsigned long start; unsigned long end; u64 new_tlb_gen; unsigned int initiating_cpu; u8 stride_shift; u8 freed_tables; }; void flush_tlb_local(void); void flush_tlb_one_user(unsigned long addr); void flush_tlb_one_kernel(unsigned long addr); void flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif #define flush_tlb_mm(mm) \ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) #define flush_tlb_range(vma, start, end) \ flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ : PAGE_SHIFT, false) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { bool should_defer = false; /* If remote CPUs need to be flushed then defer batch the flush */ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) should_defer = true; put_cpu(); return should_defer; } static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* * Bump the generation count. This also serves as a full barrier * that synchronizes with switch_mm(): callers are required to order * their read of mm_cpumask after their writes to the paging * structures. */ return atomic64_inc_return(&mm->context.tlb_gen); } static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm, unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { flush_tlb_mm(mm); } extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); static inline bool pte_flags_need_flush(unsigned long oldflags, unsigned long newflags, bool ignore_access) { /* * Flags that require a flush when cleared but not when they are set. * Only include flags that would not trigger spurious page-faults. * Non-present entries are not cached. Hardware would set the * dirty/access bit if needed without a fault. */ const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED; const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3 | _PAGE_SOFTW4 | _PAGE_SAVED_DIRTY; const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX; unsigned long diff = oldflags ^ newflags; BUILD_BUG_ON(flush_on_clear & software_flags); BUILD_BUG_ON(flush_on_clear & flush_on_change); BUILD_BUG_ON(flush_on_change & software_flags); /* Ignore software flags */ diff &= ~software_flags; if (ignore_access) diff &= ~_PAGE_ACCESSED; /* * Did any of the 'flush_on_clear' flags was clleared set from between * 'oldflags' and 'newflags'? */ if (diff & oldflags & flush_on_clear) return true; /* Flush on modified flags. */ if (diff & flush_on_change) return true; /* Ensure there are no flags that were left behind */ if (IS_ENABLED(CONFIG_DEBUG_VM) && (diff & ~(flush_on_clear | software_flags | flush_on_change))) { VM_WARN_ON_ONCE(1); return true; } return false; } /* * pte_needs_flush() checks whether permissions were demoted and require a * flush. It should only be used for userspace PTEs. */ static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { /* !PRESENT -> * ; no need for flush */ if (!(pte_flags(oldpte) & _PAGE_PRESENT)) return false; /* PFN changed ; needs flush */ if (pte_pfn(oldpte) != pte_pfn(newpte)) return true; /* * check PTE flags; ignore access-bit; see comment in * ptep_clear_flush_young(). */ return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte), true); } #define pte_needs_flush pte_needs_flush /* * huge_pmd_needs_flush() checks whether permissions were demoted and require a * flush. It should only be used for userspace huge PMDs. */ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { /* !PRESENT -> * ; no need for flush */ if (!(pmd_flags(oldpmd) & _PAGE_PRESENT)) return false; /* PFN changed ; needs flush */ if (pmd_pfn(oldpmd) != pmd_pfn(newpmd)) return true; /* * check PMD flags; do not ignore access-bit; see * pmdp_clear_flush_young(). */ return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd), false); } #define huge_pmd_needs_flush huge_pmd_needs_flush #ifdef CONFIG_ADDRESS_MASKING static inline u64 tlbstate_lam_cr3_mask(void) { u64 lam = this_cpu_read(cpu_tlbstate.lam); return lam << X86_CR3_LAM_U57_BIT; } static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); this_cpu_write(tlbstate_untag_mask, untag_mask); } #else static inline u64 tlbstate_lam_cr3_mask(void) { return 0; } static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) { native_write_cr4(cr4 ^ X86_CR4_PGE); native_write_cr4(cr4); } #endif /* _ASM_X86_TLBFLUSH_H */
591 588 4 4 3 2 2 2 1 7 6 6 3 2 3 3 1 1 2 57 36 1 2 2 2 2 1 7 6 5 4 2 1 1 7 6 5 5 7 14 3 14 8 14 4 14 7 14 2 13 1 14 4 20 1 20 1 20 3 20 2 20 1 20 1 20 1 19 2 2 14 15 11 14 14 12 14 2 14 14 1 12 1 12 1 12 10 12 9 17 16 15 15 15 3 12 15 12 15 17 2 36 2 4 4 4 2 2 15 15 13 2 2 1 1 1 658 1 1 2 7 3 2 660 2 1 4 4 5 3 7 62 61 2 3 4 4 15 2 1 528 89 663 662 657 658 532 656 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/mount.h> #include <linux/fscrypt.h> #include <linux/fileattr.h> #include "internal.h" #include <asm/ioctls.h> /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on * @cmd: ioctl command to execute * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise * returns -ENOTTY. * * Returns 0 on success, -errno on error. */ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; out: return error; } EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; if (!capable(CAP_SYS_RAWIO)) return -EPERM; error = get_user(ur_block, p); if (error) return error; if (ur_block < 0) return -EINVAL; block = ur_block; error = bmap(inode, &block); if (block > INT_MAX) { error = -ERANGE; pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", current->comm, task_pid_nr(current), sb->s_id, filp); } if (error) ur_block = 0; else ur_block = block; if (put_user(ur_block, p)) error = -EFAULT; return error; } /** * fiemap_fill_next_extent - Fiemap helper function * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes * @len: Extent length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent * info as passed in via arguments and copy to user memory. On * success, extent count on fieinfo is incremented. * * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { fieinfo->fi_extents_mapped++; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; #define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) #define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) #define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) flags |= FIEMAP_EXTENT_ENCODED; if (flags & SET_NOT_ALIGNED_FLAGS) flags |= FIEMAP_EXTENT_NOT_ALIGNED; memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; extent.fe_length = len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; if (copy_to_user(dest, &extent, sizeof(extent))) return -EFAULT; fieinfo->fi_extents_mapped++; if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) return 1; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } EXPORT_SYMBOL(fiemap_fill_next_extent); /** * fiemap_prep - check validity of requested flags for fiemap * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap * @start: Start of the mapped range * @len: Length of the mapped range, can be truncated by this function. * @supported_flags: Set of fiemap flags that the file system understands * * This function must be called from each ->fiemap instance to validate the * fiemap request against the file system parameters. * * Returns 0 on success, or a negative error on failure. */ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags) { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; int ret = 0; if (*len == 0) return -EINVAL; if (start >= maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) ret = filemap_write_and_wait(inode->i_mapping); return ret; } EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (!inode->i_op->fiemap) return -EOPNOTSUPP; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) ret = -EINVAL; else ret = 0; return ret; } static long ioctl_file_clone_range(struct file *file, struct file_clone_range __user *argp) { struct file_clone_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return ioctl_file_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); } /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. * * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. */ static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += filp->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, int mode, struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += file->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } #endif static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); case FS_IOC_RESVSP: case FS_IOC_RESVSP64: return ioctl_preallocate(filp, 0, p); case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p); case FS_IOC_ZERO_RANGE: return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = O_NONBLOCK; #ifdef __sparc__ /* SunOS compatibility item. */ if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; spin_unlock(&filp->f_lock); return error; } static int ioctl_fioasync(unsigned int fd, struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = on ? FASYNC : 0; /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else error = -ENOTTY; } return error < 0 ? error : 0; } static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ if (sb->s_op->freeze_super) return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); return freeze_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ if (sb->s_op->thaw_super) return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); return thaw_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_file_dedupe_range(struct file *file, struct file_dedupe_range __user *argp) { struct file_dedupe_range *same = NULL; int ret; unsigned long size; u16 count; if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } size = offsetof(struct file_dedupe_range, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; } same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); same = NULL; goto out; } same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; ret = copy_to_user(argp, same, size); if (ret) ret = -EFAULT; out: kfree(same); return ret; } /** * fileattr_fill_xflags - initialize fileattr with xflags * @fa: fileattr pointer * @xflags: FS_XFLAG_* flags * * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All * other fields are zeroed. */ void fileattr_fill_xflags(struct fileattr *fa, u32 xflags) { memset(fa, 0, sizeof(*fa)); fa->fsx_valid = true; fa->fsx_xflags = xflags; if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE) fa->flags |= FS_IMMUTABLE_FL; if (fa->fsx_xflags & FS_XFLAG_APPEND) fa->flags |= FS_APPEND_FL; if (fa->fsx_xflags & FS_XFLAG_SYNC) fa->flags |= FS_SYNC_FL; if (fa->fsx_xflags & FS_XFLAG_NOATIME) fa->flags |= FS_NOATIME_FL; if (fa->fsx_xflags & FS_XFLAG_NODUMP) fa->flags |= FS_NODUMP_FL; if (fa->fsx_xflags & FS_XFLAG_DAX) fa->flags |= FS_DAX_FL; if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) fa->flags |= FS_PROJINHERIT_FL; } EXPORT_SYMBOL(fileattr_fill_xflags); /** * fileattr_fill_flags - initialize fileattr with flags * @fa: fileattr pointer * @flags: FS_*_FL flags * * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). * All other fields are zeroed. */ void fileattr_fill_flags(struct fileattr *fa, u32 flags) { memset(fa, 0, sizeof(*fa)); fa->flags_valid = true; fa->flags = flags; if (fa->flags & FS_SYNC_FL) fa->fsx_xflags |= FS_XFLAG_SYNC; if (fa->flags & FS_IMMUTABLE_FL) fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; if (fa->flags & FS_APPEND_FL) fa->fsx_xflags |= FS_XFLAG_APPEND; if (fa->flags & FS_NODUMP_FL) fa->fsx_xflags |= FS_XFLAG_NODUMP; if (fa->flags & FS_NOATIME_FL) fa->fsx_xflags |= FS_XFLAG_NOATIME; if (fa->flags & FS_DAX_FL) fa->fsx_xflags |= FS_XFLAG_DAX; if (fa->flags & FS_PROJINHERIT_FL) fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; } EXPORT_SYMBOL(fileattr_fill_flags); /** * vfs_fileattr_get - retrieve miscellaneous file attributes * @dentry: the object to retrieve from * @fa: fileattr pointer * * Call i_op->fileattr_get() callback, if exists. * * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); if (!inode->i_op->fileattr_get) return -ENOIOCTLCMD; return inode->i_op->fileattr_get(dentry, fa); } EXPORT_SYMBOL(vfs_fileattr_get); /** * copy_fsxattr_to_user - copy fsxattr to userspace. * @fa: fileattr pointer * @ufa: fsxattr user pointer * * Return: 0 on success, or -EFAULT on failure. */ int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; memset(&xfa, 0, sizeof(xfa)); xfa.fsx_xflags = fa->fsx_xflags; xfa.fsx_extsize = fa->fsx_extsize; xfa.fsx_nextents = fa->fsx_nextents; xfa.fsx_projid = fa->fsx_projid; xfa.fsx_cowextsize = fa->fsx_cowextsize; if (copy_to_user(ufa, &xfa, sizeof(xfa))) return -EFAULT; return 0; } EXPORT_SYMBOL(copy_fsxattr_to_user); static int copy_fsxattr_from_user(struct fileattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; if (copy_from_user(&xfa, ufa, sizeof(xfa))) return -EFAULT; fileattr_fill_xflags(fa, xfa.fsx_xflags); fa->fsx_extsize = xfa.fsx_extsize; fa->fsx_nextents = xfa.fsx_nextents; fa->fsx_projid = xfa.fsx_projid; fa->fsx_cowextsize = xfa.fsx_cowextsize; return 0; } /* * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject * any invalid configurations. * * Note: must be called with inode lock held. */ static int fileattr_set_prepare(struct inode *inode, const struct fileattr *old_ma, struct fileattr *fa) { int err; /* * The IMMUTABLE and APPEND_ONLY flags can only be changed by * the relevant capability. */ if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags); if (err) return err; /* * Project Quota ID state is only allowed to change from within the init * namespace. Enforce that restriction only if we are trying to change * the quota ID state. Everything else is allowed in user namespaces. */ if (current_user_ns() != &init_user_ns) { if (old_ma->fsx_projid != fa->fsx_projid) return -EINVAL; if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & FS_XFLAG_PROJINHERIT) return -EINVAL; } else { /* * Caller is allowed to change the project ID. If it is being * changed, make sure that the new value is valid. */ if (old_ma->fsx_projid != fa->fsx_projid && !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) return -EINVAL; } /* Check extent size hints. */ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(inode->i_mode)) return -EINVAL; if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return -EINVAL; /* * It is only valid to set the DAX flag on regular files and * directories on filesystems. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; /* Extent size hints of zero turn off the flags. */ if (fa->fsx_extsize == 0) fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); if (fa->fsx_cowextsize == 0) fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; return 0; } /** * vfs_fileattr_set - change miscellaneous file attributes * @idmap: idmap of the mount * @dentry: the object to change * @fa: fileattr pointer * * After verifying permissions, call i_op->fileattr_set() callback, if * exists. * * Verifying attributes involves retrieving current attributes with * i_op->fileattr_get(), this also allows initializing attributes that have * not been set by the caller to current values. Inode lock is held * thoughout to prevent racing with another instance. * * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct fileattr old_ma = {}; int err; if (!inode->i_op->fileattr_set) return -ENOIOCTLCMD; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; inode_lock(inode); err = vfs_fileattr_get(dentry, &old_ma); if (!err) { /* initialize missing bits from old_ma */ if (fa->flags_valid) { fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON; fa->fsx_extsize = old_ma.fsx_extsize; fa->fsx_nextents = old_ma.fsx_nextents; fa->fsx_projid = old_ma.fsx_projid; fa->fsx_cowextsize = old_ma.fsx_cowextsize; } else { fa->flags |= old_ma.flags & ~FS_COMMON_FL; } err = fileattr_set_prepare(inode, &old_ma, fa); if (!err) err = inode->i_op->fileattr_set(idmap, dentry, fa); } inode_unlock(inode); return err; } EXPORT_SYMBOL(vfs_fileattr_set); static int ioctl_getflags(struct file *file, unsigned int __user *argp) { struct fileattr fa = { .flags_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); if (!err) err = put_user(fa.flags, argp); return err; } static int ioctl_setflags(struct file *file, unsigned int __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; unsigned int flags; int err; err = get_user(flags, argp); if (!err) { err = mnt_want_write_file(file); if (!err) { fileattr_fill_flags(&fa, flags); err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } return err; } static int ioctl_fsgetxattr(struct file *file, void __user *argp) { struct fileattr fa = { .fsx_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); if (!err) err = copy_fsxattr_to_user(&fa, argp); return err; } static int ioctl_fssetxattr(struct file *file, void __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; struct fileattr fa; int err; err = copy_fsxattr_from_user(&fa, argp); if (!err) { err = mnt_want_write_file(file); if (!err) { err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); } } return err; } static int ioctl_getfsuuid(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; struct fsuuid2 u = { .len = sb->s_uuid_len, }; if (!sb->s_uuid_len) return -ENOTTY; memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) { struct super_block *sb = file_inode(file)->i_sb; if (!strlen(sb->s_sysfs_name)) return -ENOTTY; struct fs_sysfs_path u = {}; u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. * * The LSM mailing list should also be notified of any command additions or * changes, as specific LSMs may be affected. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); return 0; case FIONCLEX: set_close_on_exec(fd, 0); return 0; case FIONBIO: return ioctl_fionbio(filp, argp); case FIOASYNC: return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? -EFAULT : 0; } return -ENOTTY; case FIFREEZE: return ioctl_fsfreeze(filp); case FITHAW: return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); case FICLONERANGE: return ioctl_file_clone_range(filp, argp); case FIDEDUPERANGE: return ioctl_file_dedupe_range(filp, argp); case FIONREAD: if (!S_ISREG(inode->i_mode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); case FS_IOC_GETFLAGS: return ioctl_getflags(filp, argp); case FS_IOC_SETFLAGS: return ioctl_setflags(filp, argp); case FS_IOC_FSGETXATTR: return ioctl_fsgetxattr(filp, argp); case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); case FS_IOC_GETFSUUID: return ioctl_getfsuuid(filp, argp); case FS_IOC_GETFSSYSFSPATH: return ioctl_get_fs_sysfs_path(filp, argp); default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); break; } return -ENOIOCTLCMD; } SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); return error; } #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation * @file: The file to operate on. * @cmd: The ioctl command number. * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as * * .compat_ioctl = compat_ptr_ioctl, * * On most architectures, the compat_ptr_ioctl() just passes all arguments * to the corresponding ->ioctl handler. The exception is arch/s390, where * compat_ptr() clears the top bit of a 32-bit pointer value, so user space * pointers to the second 2GB alias the first 2GB, as is the case for * native 32-bit s390 user space. * * The compat_ptr_ioctl() function must therefore be used only with ioctl * functions that either ignore the argument or pass a pointer to a * compatible data type. * * If any ioctl command handled by fops->unlocked_ioctl passes a plain * integer instead of a pointer, or any of the passed data types * is incompatible between 32-bit and 64-bit architectures, a proper * handler is required instead of compat_ptr_ioctl. */ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { if (!file->f_op->unlocked_ioctl) return -ENOIOCTLCMD; return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { CLASS(fd, f)(fd); int error; if (fd_empty(f)) return -EBADF; error = security_file_ioctl_compat(fd_file(f), cmd, arg); if (error) return error; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ case FICLONE: error = ioctl_file_clone(fd_file(f), arg, 0, 0, 0); break; #if defined(CONFIG_X86_64) /* these get messy on amd64 due to alignment differences */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(fd_file(f), 0, compat_ptr(arg)); break; case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_PUNCH_HOLE, compat_ptr(arg)); break; case FS_IOC_ZERO_RANGE_32: error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_ZERO_RANGE, compat_ptr(arg)); break; #endif /* * These access 32-bit values anyway so no further handling is * necessary. */ case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: cmd = (cmd == FS_IOC32_GETFLAGS) ? FS_IOC_GETFLAGS : FS_IOC_SETFLAGS; fallthrough; /* * everything else in do_vfs_ioctl() takes either a compatible * pointer argument or no argument -- call it with a modified * argument. */ default: error = do_vfs_ioctl(fd_file(f), fd, cmd, (unsigned long)compat_ptr(arg)); if (error != -ENOIOCTLCMD) break; if (fd_file(f)->f_op->compat_ioctl) error = fd_file(f)->f_op->compat_ioctl(fd_file(f), cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; break; } return error; } #endif
40 39 35 35 34 26 40 40 16 15 16 12 6 80 80 80 21 21 9 21 3 21 21 23 9 2 7 7 7 23 13 23 23 19 22 23 7 23 7 7 7 2 5 5 3 5 3 7 5 1 5 4 4 6 5 1 2 1 1 6 3 2 1 1 1 1 1 2 5 5 4 3 3 3 2 1 3 5 4 3 2 1 1 2 1 1 4 16 15 14 22 11 1 1 4 3 2 1 1 1 11 38 38 22 3 3 3 3 21 20 20 2 3 7 3 1 21 1 1 2 1 1 1 1 1 7 21 25 25 21 5 5 5 4 4 4 4 4 5 7 6 6 2 2 5 4 1 3 2 1 1 1 1 1 2 1 7 10 7 8 9 9 9 10 8 21 2 19 2 19 9 1 8 8 8 9 9 21 21 21 21 21 21 20 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/proto.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/random.h> #include <linux/slab.h> #include <net/checksum.h> #include <net/inet_sock.h> #include <net/inet_common.h> #include <net/sock.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> #include "ccid.h" #include "dccp.h" #include "feat.h" #define CREATE_TRACE_POINTS #include "trace.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); DEFINE_PER_CPU(unsigned int, dccp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; #ifdef CONFIG_IP_DCCP_DEBUG static const char *dccp_state_name(const int state) { static const char *const dccp_state_names[] = { [DCCP_OPEN] = "OPEN", [DCCP_REQUESTING] = "REQUESTING", [DCCP_PARTOPEN] = "PARTOPEN", [DCCP_LISTEN] = "LISTEN", [DCCP_RESPOND] = "RESPOND", [DCCP_CLOSING] = "CLOSING", [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", [DCCP_TIME_WAIT] = "TIME_WAIT", [DCCP_CLOSED] = "CLOSED", }; if (state >= DCCP_MAX_STATES) return "INVALID STATE!"; else return dccp_state_names[state]; } #endif void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); switch (state) { case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); /* Client retransmits all Confirm options until entering OPEN */ if (oldstate == DCCP_PARTOPEN) dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || oldstate == DCCP_CLOSING) DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash != NULL && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == DCCP_OPEN) DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_set_state(sk, state); } EXPORT_SYMBOL_GPL(dccp_set_state); static void dccp_finish_passive_close(struct sock *sk) { switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: /* Node (client or server) has received Close packet. */ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_set_state(sk, DCCP_CLOSED); break; case DCCP_PASSIVE_CLOSEREQ: /* * Client received CloseReq. We set the `active' flag so that * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. */ dccp_send_close(sk, 1); dccp_set_state(sk, DCCP_CLOSING); } } void dccp_done(struct sock *sk) { dccp_set_state(sk, DCCP_CLOSED); dccp_clear_xmit_timers(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(dccp_done); const char *dccp_packet_name(const int type) { static const char *const dccp_packet_names[] = { [DCCP_PKT_REQUEST] = "REQUEST", [DCCP_PKT_RESPONSE] = "RESPONSE", [DCCP_PKT_DATA] = "DATA", [DCCP_PKT_ACK] = "ACK", [DCCP_PKT_DATAACK] = "DATAACK", [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", [DCCP_PKT_CLOSE] = "CLOSE", [DCCP_PKT_RESET] = "RESET", [DCCP_PKT_SYNC] = "SYNC", [DCCP_PKT_SYNCACK] = "SYNCACK", }; if (type >= DCCP_NR_PKT_TYPES) return "INVALID"; else return dccp_packet_names[type]; } EXPORT_SYMBOL_GPL(dccp_packet_name); void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; } EXPORT_SYMBOL_GPL(dccp_destruct_common); static void dccp_sk_destruct(struct sock *sk) { dccp_destruct_common(sk); inet_sock_destruct(sk); } int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); INIT_LIST_HEAD(&dp->dccps_featneg); /* control socket doesn't need feat nego */ if (likely(ctl_sock_initialized)) return dccp_feat_init(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } /* Clean up a referenced DCCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(sk); kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); static inline int dccp_need_reset(int state) { return state != DCCP_CLOSED && state != DCCP_LISTEN && state != DCCP_REQUESTING; } int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); const int old_state = sk->sk_state; if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); /* * This corresponds to the ABORT function of RFC793, sec. 3.8 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". */ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); } else if (dccp_need_reset(old_state)) { dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); sk->sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); icsk->icsk_backoff = 0; inet_csk_delack_init(sk); __sk_dst_reset(sk); WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk_error_report(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_disconnect); /* * Wait for a DCCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes made by another threads is impossible in any case. */ mask = 0; if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. */ if (sk_stream_is_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } } } return mask; } EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) goto out; switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ rc = 0; } break; case SIOCINQ: { struct sk_buff *skb; *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. */ *karg = skb->len; } rc = 0; } break; default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; if (service == DCCP_SERVICE_INVALID_VALUE || optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) return -EINVAL; if (optlen > sizeof(service)) { sl = kmalloc(optlen, GFP_KERNEL); if (sl == NULL) return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; if (copy_from_sockptr_offset(sl->dccpsl_list, optval, sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; } } lock_sock(sk); dp->dccps_service = service; kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); return 0; } static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) { u8 *list, len; int i, rc; if (cscov < 0 || cscov > 15) return -EINVAL; /* * Populate a list of permissible values, in the range cscov...15. This * is necessary since feature negotiation of single values only works if * both sides incidentally choose the same value. Since the list starts * lowest-value first, negotiation will pick the smallest shared value. */ if (cscov == 0) return 0; len = 16 - cscov; list = kmalloc(len, GFP_KERNEL); if (list == NULL) return -ENOBUFS; for (i = 0; i < len; i++) list[i] = cscov++; rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); if (rc == 0) { if (rx) dccp_sk(sk)->dccps_pcrlen = cscov; else dccp_sk(sk)->dccps_pcslen = cscov; } kfree(list); return rc; } static int dccp_setsockopt_ccid(struct sock *sk, int type, sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); lock_sock(sk); if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); release_sock(sk); kfree(val); return rc; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CHANGE_L: case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CCID: case DCCP_SOCKOPT_RX_CCID: case DCCP_SOCKOPT_TX_CCID: return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) return dccp_setsockopt_service(sk, val, optval, optlen); lock_sock(sk); switch (optname) { case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; case DCCP_SOCKOPT_SEND_CSCOV: err = dccp_setsockopt_cscov(sk, val, false); break; case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; case DCCP_SOCKOPT_QPOLICY_ID: if (sk->sk_state != DCCP_CLOSED) err = -EISCONN; else if (val < 0 || val >= DCCPQ_POLICY_MAX) err = -EINVAL; else dp->dccps_qpolicy = val; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: if (val < 0) err = -EINVAL; else dp->dccps_tx_qlen = val; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); return do_dccp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_setsockopt); static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) { const struct dccp_sock *dp = dccp_sk(sk); const struct dccp_service_list *sl; int err = -ENOENT, slen = 0, total_len = sizeof(u32); lock_sock(sk); if ((sl = dp->dccps_service_list) != NULL) { slen = sl->dccpsl_nr * sizeof(u32); total_len += slen; } err = -EINVAL; if (total_len > len) goto out; err = 0; if (put_user(total_len, optlen) || put_user(dp->dccps_service, optval) || (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) err = -EFAULT; out: release_sock(sk); return err; } static int do_dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dccp_sock *dp; int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < (int)sizeof(int)) return -EINVAL; dp = dccp_sk(sk); switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_SERVICE: return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_TX_CCID: val = ccid_get_current_tx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_RX_CCID: val = ccid_get_current_rx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp->dccps_pcslen; break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; case DCCP_SOCKOPT_QPOLICY_ID: val = dp->dccps_qpolicy; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: val = dp->dccps_tx_qlen; break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); case 192 ... 255: return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, len, (u32 __user *)optval, optlen); default: return -ENOPROTOOPT; } len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); return do_dccp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_getsockopt); static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; /* * Assign an (opaque) qpolicy priority value to skb->priority. * * We are overloading this skb field for use with the qpolicy subystem. * The skb->priority is normally used for the SO_PRIORITY option, which * is initialised from sk_priority. Since the assignment of sk_priority * to skb->priority happens later (on layer 3), we overload this field * for use with queueing priorities as long as the skb is on layer 4. * The default priority value (if nothing is set) is 0. */ skb->priority = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_DCCP) continue; if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) return -EINVAL; switch (cmsg->cmsg_type) { case DCCP_SCM_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) return -EINVAL; skb->priority = *(__u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } } return 0; } int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { const struct dccp_sock *dp = dccp_sk(sk); const int flags = msg->msg_flags; const int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; int rc, size; long timeo; trace_dccp_probe(sk, len); if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); timeo = sock_sndtimeo(sk, noblock); /* * We have to use sk_stream_wait_connect here to set sk_write_pending, * so that the trick in dccp_rcv_request_sent_state_process. */ /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_release; size = sk->sk_prot->max_header + len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (skb == NULL) goto out_release; if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_discard; } if (sk->sk_state == DCCP_CLOSED) { rc = -ENOTCONN; goto out_discard; } /* We need to check dccps_mss_cache after socket is locked. */ if (len > dp->dccps_mss_cache) { rc = -EMSGSIZE; goto out_discard; } skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) goto out_discard; rc = dccp_msghdr_parse(msg, skb); if (rc != 0) goto out_discard; dccp_qpolicy_push(sk, skb); /* * The xmit_timer is set if the TX CCID is rate-based and will expire * when congestion control permits to release further packets into the * network. Window-based CCIDs do not use this timer. */ if (!timer_pending(&dp->dccps_xmit_timer)) dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; out_discard: kfree_skb(skb); goto out_release; } EXPORT_SYMBOL_GPL(dccp_sendmsg); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { const struct dccp_hdr *dh; long timeo; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) { len = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) goto verify_sock_status; dh = dccp_hdr(skb); switch (dh->dccph_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: goto found_ok_skb; case DCCP_PKT_CLOSE: case DCCP_PKT_CLOSEREQ: if (!(flags & MSG_PEEK)) dccp_finish_passive_close(sk); fallthrough; case DCCP_PKT_RESET: dccp_pr_debug("found fin (%s) ok!\n", dccp_packet_name(dh->dccph_type)); len = 0; goto found_fin_ok; default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; break; } if (sk->sk_err) { len = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { len = 0; break; } if (sk->sk_state == DCCP_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read * from never connected socket. */ len = -ENOTCONN; break; } len = 0; break; } if (!timeo) { len = -EAGAIN; break; } if (signal_pending(current)) { len = sock_intr_errno(timeo); break; } sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; } if (flags & MSG_TRUNC) len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; } while (1); out: release_sock(sk); return len; } EXPORT_SYMBOL_GPL(dccp_recvmsg); int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) goto out; old_state = sk->sk_state; if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != DCCP_LISTEN) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; /* do not start to listen if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dp)) { err = -EPROTO; goto out; } err = inet_csk_listen_start(sk); if (err) goto out; } err = 0; out: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(inet_dccp_listen); static void dccp_terminate_connection(struct sock *sk) { u8 next_state = DCCP_CLOSED; switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: case DCCP_PASSIVE_CLOSEREQ: dccp_finish_passive_close(sk); break; case DCCP_PARTOPEN: dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); fallthrough; case DCCP_OPEN: dccp_send_close(sk, 1); if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && !dccp_sk(sk)->dccps_server_timewait) next_state = DCCP_ACTIVE_CLOSEREQ; else next_state = DCCP_CLOSING; fallthrough; default: dccp_set_state(sk, next_state); } } void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; u32 data_was_unread = 0; int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == DCCP_LISTEN) { dccp_set_state(sk, DCCP_CLOSED); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } sk_stop_timer(sk, &dp->dccps_xmit_timer); /* * We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { data_was_unread += skb->len; __kfree_skb(skb); } /* If socket has been already reset kill it. */ if (sk->sk_state == DCCP_CLOSED) goto adjudge_to_death; if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { /* * Normal connection termination. May need to wait if there are * still packets in the TX queue that are delayed by the CCID. */ dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } /* * Flush write queue. This may be necessary in several cases: * - we have been closed by the peer but still have application data; * - abortive termination (unread data or zero linger time), * - normal termination but queue could not be flushed within time limit */ __skb_queue_purge(&sk->sk_write_queue); sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); /* * It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); /* * Now socket is owned by kernel and we acquire BH lock * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL_GPL(dccp_close); void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("called shutdown(%x)\n", how); } EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) return -ENOMEM; return 0; } static inline void dccp_mib_exit(void) { free_percpu(dccp_statistics); } static int thash_entries; module_param(thash_entries, int, 0444); MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); #ifdef CONFIG_IP_DCCP_DEBUG bool dccp_debug; module_param(dccp_debug, bool, 0644); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; dccp_hashinfo.bind2_bucket_cachep = kmem_cache_create("dccp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind2_bucket_cachep) goto out_free_bind_bucket_cachep; /* * Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (21 - PAGE_SHIFT); else goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) ; do { unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / sizeof(struct inet_ehash_bucket); while (hash_size & (hash_size - 1)) hash_size--; dccp_hashinfo.ehash_mask = hash_size - 1; dccp_hashinfo.ehash = (struct inet_ehash_bucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); } while (!dccp_hashinfo.ehash && --ehash_order > 0); if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); goto out_free_bind2_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&dccp_hashinfo)) goto out_free_dccp_ehash; bhash_order = ehash_order; do { dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / sizeof(struct inet_bind_hashbucket); if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) continue; dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); } while (!dccp_hashinfo.bhash && --bhash_order >= 0); if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); goto out_free_dccp_locks; } dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); if (!dccp_hashinfo.bhash2) { DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); goto out_free_dccp_bhash; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); spin_lock_init(&dccp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } dccp_hashinfo.pernet = false; rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash2; rc = dccp_ackvec_init(); if (rc) goto out_free_dccp_mib; rc = dccp_sysctl_init(); if (rc) goto out_ackvec_exit; rc = ccid_initialize_builtins(); if (rc) goto out_sysctl_exit; dccp_timestamping_init(); return 0; out_sysctl_exit: dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); out_free_dccp_bhash2: free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind2_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.bhash2 = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; dccp_hashinfo.bind2_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { int bhash_order = get_order(dccp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); ccid_cleanup_builtins(); dccp_mib_exit(); free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); } module_init(dccp_init); module_exit(dccp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
33 33 19 18 16 16 18 2 16 7 14 2 2 2 2 2 2 14 33 96 95 35 35 35 35 34 33 33 33 33 33 33 32 32 32 32 32 32 32 32 33 33 33 35 2 33 33 33 33 33 33 33 33 33 33 33 33 33 33 32 38 38 34 35 38 38 62 62 62 62 62 62 1 1 81 62 62 62 62 62 62 62 62 62 62 62 1 62 70 70 70 70 70 70 70 70 70 70 70 70 70 70 79 62 62 62 62 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; t->net = dev_net(dev); ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; struct ip6_tnl *nt, *t; int err; nt = netdev_priv(dev); if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_local = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) ip6_tnl_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
807 810 1640 1644 119 121 1719 121 1644 92 92 92 92 1719 1718 1712 129 3701 3 3 3 905 2456 777 781 150 649 3 358 358 359 273 272 273 497 364 478 9 268 308 194 1552 1553 248 3 3 443 122 442 146 145 146 145 121 146 3073 3064 193 176 206 2959 2968 2969 2969 2969 55 92 625 1558 1557 265 235 119 267 1708 408 1547 1208 1708 7 7 63 63 200 387 239 60 499 353 329 353 353 352 353 429 429 429 427 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* memcontrol.h - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <balbir@linux.vnet.ibm.com> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> */ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include <linux/cgroup.h> #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> #include <linux/shrinker.h> struct mem_cgroup; struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, MEMCG_KMEM, MEMCG_ZSWAP_B, MEMCG_ZSWAPPED, MEMCG_NR_STAT, }; enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int generation; }; #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 struct mem_cgroup_id { int id; refcount_t ref; }; struct memcg_vmstats_percpu; struct memcg1_events_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ atomic_t generation; }; /* * per-node information in memory controller. */ struct mem_cgroup_per_node { /* Keep the read-only fields at the start */ struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; struct shrinker_info __rcu *shrinker_info; #ifdef CONFIG_MEMCG_V1 /* * Memcg-v1 only stuff in middle as buffer between read mostly fields * and update often fields to avoid false sharing. If v1 stuff is * not present, an explicit padding is needed. */ struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; #else CACHELINE_PADDING(_pad1_); #endif /* Fields which get updated often at the end. */ struct lruvec lruvec; CACHELINE_PADDING(_pad2_); unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ struct mem_cgroup_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* registered local peak watchers */ struct list_head memory_peaks; struct list_head swap_peaks; spinlock_t peaks_lock; /* Range enforcement for interrupt charges */ struct work_struct high_work; #ifdef CONFIG_ZSWAP unsigned long zswap_max; /* * Prevent pages from this memcg from being written back from zswap to * swap, and from being swapped out on zswap store failures. */ bool zswap_writeback; #endif /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the OOM killer kill all belonging tasks, had it kill one? */ bool oom_group; int swappiness; /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; /* memory.stat */ struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; /* * Hint of reclaim pressure for socket memroy management. Note * that this indicator should NOT be used in legacy cgroup mode * where socket memory is accounted/charged separately. */ unsigned long socket_pressure; int kmemcg_id; /* * memcg->objcg is wiped out as a part of the objcg repaprenting * process. memcg->orig_objcg preserves a pointer (and a reference) * to the original objcg until the end of live of memcg. */ struct obj_cgroup __rcu *objcg; struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif #ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif #ifdef CONFIG_MEMCG_V1 /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ struct memcg1_events_percpu __percpu *events_percpu; unsigned long soft_limit; /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; /* OOM-Killer disable */ int oom_kill_disable; /* protect arrays of thresholds */ struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; #endif /* CONFIG_MEMCG_V1 */ struct mem_cgroup_per_node *nodeinfo[]; }; /* * size of first charge trial. * TODO: maybe necessary to use big numbers in big irons or dynamic based of the * workload. */ #define MEMCG_CHARGE_BATCH 64U extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { /* page->memcg_data is a pointer to an slabobj_ext vector */ MEMCG_DATA_OBJEXTS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { /* slabobj_ext vector failed to allocate */ OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) #ifdef CONFIG_MEMCG static inline bool folio_memcg_kmem(struct folio *folio); /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } /* * __folio_memcg - Get the memory cgroup associated with a non-kmem folio * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * kmem folios. */ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * __folio_objcg - get the object cgroup associated with a kmem folio. * @folio: Pointer to the folio. * * Returns a pointer to the object cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper object cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios or * LRU folios. */ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) { unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* * folio_memcg - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function assumes that the folio is known to have a * proper memory cgroup pointer. It's not safe to call this function * against some type of folios, e.g. slab folios or ex-slab folios. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - exclusive reference * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg(struct folio *folio) { if (folio_memcg_kmem(folio)) return obj_cgroup_memcg(__folio_objcg(folio)); return __folio_memcg(folio); } /* * folio_memcg_charged - If a folio is charged to a memory cgroup. * @folio: Pointer to the folio. * * Returns true if folio is charged to a memory cgroup, otherwise returns false. */ static inline bool folio_memcg_charged(struct folio *folio) { if (folio_memcg_kmem(folio)) return __folio_objcg(folio) != NULL; return __folio_memcg(folio) != NULL; } /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. * * Returns a pointer to the memory cgroup associated with the folio, * or NULL. This function unlike folio_memcg() can take any folio * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * * - the folio lock * - LRU isolation * - exclusive reference * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. */ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* * Because folio->memcg_data might be changed asynchronously * for slabs, READ_ONCE() should be used here. */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } static inline struct mem_cgroup *page_memcg_check(struct page *page) { if (PageTail(page)) return NULL; return folio_memcg_check((struct folio *)page); } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; rcu_read_lock(); retry: memcg = obj_cgroup_memcg(objcg); if (unlikely(!css_tryget(&memcg->css))) goto retry; rcu_read_unlock(); return memcg; } /* * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set. * @folio: Pointer to the folio. * * Checks if the folio has MemcgKmem flag set. The caller must ensure * that the folio has an associated memory cgroup. It's not safe to call * this function against some types of folios, e.g. slab folios. */ static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } static inline bool PageMemcgKmem(struct page *page) { return folio_memcg_kmem(page_folio(page)); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; if (mem_cgroup_disabled()) return; /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. * * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. The target memcg's protection is ignored, see * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || memcg == target; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** * mem_cgroup_charge - Charge a newly allocated folio to a cgroup. * @folio: Folio to charge. * @mm: mm context of the allocating task. * @gfp: Reclaim mode. * * Try to charge @folio to the memcg that @mm belongs to, reclaiming * pages according to @gfp if necessary. If @mm is NULL, try to * charge to the active memcg. * * Do not use this for folios allocated for swapin. * * Return: 0 on success. Otherwise, an error code is returned. */ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { if (mem_cgroup_disabled()) return 0; return __mem_cgroup_charge(folio, mm, gfp); } int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); void __mem_cgroup_uncharge(struct folio *folio); /** * mem_cgroup_uncharge - Uncharge a folio. * @folio: Folio to uncharge. * * Uncharge a folio previously charged with mem_cgroup_charge(). */ static inline void mem_cgroup_uncharge(struct folio *folio) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge(folio); } void __mem_cgroup_uncharge_folios(struct folio_batch *folios); static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { if (mem_cgroup_disabled()) return; __mem_cgroup_uncharge_folios(folios); } void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = memcg->nodeinfo[pgdat->node_id]; lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } /** * folio_lruvec - return lruvec for isolating/putting an LRU folio * @folio: Pointer to the folio. * * This function relies on folio->mem_cgroup being stable. */ static inline struct lruvec *folio_lruvec(struct folio *folio) { struct mem_cgroup *memcg = folio_memcg(folio); VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio); return mem_cgroup_lruvec(memcg, folio_pgdat(folio)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flags); #ifdef CONFIG_DEBUG_VM void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio); #else static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } #endif static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { percpu_ref_get(&objcg->refcnt); } static inline void obj_cgroup_get_many(struct obj_cgroup *objcg, unsigned long nr) { percpu_ref_get_many(&objcg->refcnt, nr); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { if (objcg) percpu_ref_put(&objcg->refcnt); } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return !memcg || css_tryget(&memcg->css); } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return !memcg || css_tryget_online(&memcg->css); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*)(struct task_struct *, void *), void *arg); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root. */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return !!(memcg->css.flags & CSS_ONLINE); } void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); unsigned long mem_cgroup_size(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_state(memcg, idx, val); local_irq_restore(flags); } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { unsigned long flags; local_irq_save(flags); __count_memcg_events(memcg, idx, count); local_irq_restore(flags); } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { struct mem_cgroup *memcg = folio_memcg(folio); if (memcg) count_memcg_events(memcg, idx, nr); } static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, idx, count); rcu_read_unlock(); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { count_memcg_events_mm(mm, idx, 1); } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; atomic_long_inc(&memcg->memory_events_local[event]); if (!swap_event) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); if (swap_event) cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *head, int old_order, int new_order); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; } static inline bool folio_memcg_charged(struct folio *folio) { return false; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { return NULL; } static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { return NULL; } static inline bool folio_memcg_kmem(struct folio *folio) { return false; } static inline bool PageMemcgKmem(struct page *page) { return false; } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_below_low(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *target, struct mem_cgroup *memcg) { return false; } static inline void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) { } static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages) { return 0; } static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; } static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) { } static inline void mem_cgroup_uncharge(struct folio *folio) { } static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { } static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { } static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); return &pgdat->__lruvec; } static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) { } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_current(void) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { return NULL; } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) { return true; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct lruvec *folio_lruvec_lock(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irq(&pgdat->__lruvec.lru_lock); return &pgdat->__lruvec; } static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, unsigned long *flagsp) { struct pglist_data *pgdat = folio_pgdat(folio); spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp); return &pgdat->__lruvec; } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } #ifdef CONFIG_SHRINKER_DEBUG static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } #endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { return NULL; } static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } static inline void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); mod_node_page_state(page_pgdat(page), idx, val); } static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { } static inline void count_memcg_events_mm(struct mm_struct *mm, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } #endif /* CONFIG_MEMCG */ /* * Extended information for slab objects stored as an array in page->memcg_data * if MEMCG_DATA_OBJEXTS is set. */ struct slabobj_ext { #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif #ifdef CONFIG_MEM_ALLOC_PROFILING union codetag_ref ref; #endif } __aligned(8); static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); } static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; memcg = lruvec_memcg(lruvec); if (!memcg) return NULL; memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } static inline void unlock_page_lruvec(struct lruvec *lruvec) { spin_unlock(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irq(struct lruvec *lruvec) { spin_unlock_irq(&lruvec->lru_lock); } static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, unsigned long flags) { spin_unlock_irqrestore(&lruvec->lru_lock, flags); } /* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { return lruvec_pgdat(lruvec) == folio_pgdat(folio) && lruvec_memcg(lruvec) == folio_memcg(folio); } /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio, struct lruvec *locked_lruvec) { if (locked_lruvec) { if (folio_matches_lruvec(folio, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); } return folio_lruvec_lock_irq(folio); } /* Don't lock again iff folio's lruvec locked */ static inline void folio_lruvec_relock_irqsave(struct folio *folio, struct lruvec **lruvecp, unsigned long *flags) { if (*lruvecp) { if (folio_matches_lruvec(folio, *lruvecp)) return; unlock_page_lruvec_irqrestore(*lruvecp, *flags); } *lruvecp = folio_lruvec_lock_irqsave(folio, flags); } #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; memcg = folio_memcg(folio); if (unlikely(memcg && &memcg->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(folio, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; } static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { } static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, struct bdi_writeback *wb) { } static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { #ifdef CONFIG_MEMCG_V1 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return !!memcg->tcpmem_pressure; #endif /* CONFIG_MEMCG_V1 */ do { if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; } int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } #endif #ifdef CONFIG_MEMCG bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); /* * The returned objcg pointer is safe to use without additional * protection within a scope. The scope is defined either by * the current task (similar to the "current" global variable) * or by set_active_memcg() pair. * Please, use obj_cgroup_get() to get a reference if the pointer * needs to be used outside of the local scope. */ struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); static inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg = current_obj_cgroup(); if (objcg) obj_cgroup_get(objcg); return objcg; } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); extern struct static_key_false memcg_bpf_enabled_key; static inline bool memcg_bpf_enabled(void) { return static_branch_likely(&memcg_bpf_enabled_key); } extern struct static_key_false memcg_kmem_online_key; static inline bool memcg_kmem_online(void) { return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. */ static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return memcg ? memcg->kmemcg_id : -1; } struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; if (!memcg_kmem_online()) return; rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); count_memcg_events(memcg, idx, count); rcu_read_unlock(); } #else static inline bool mem_cgroup_kmem_disabled(void) { return true; } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { } static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } static inline bool memcg_bpf_enabled(void) { return false; } static inline bool memcg_kmem_online(void) { return false; } static inline int memcg_kmem_id(struct mem_cgroup *memcg) { return -1; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; } static inline void count_objcg_events(struct obj_cgroup *objcg, enum vm_event_item idx, unsigned long count) { } #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { return true; } static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) { /* if zswap is disabled, do not block pages going to the swapping device */ return true; } #endif /* Cgroup v1-related declarations */ #ifdef CONFIG_MEMCG_V1 unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); bool mem_cgroup_oom_synchronize(bool wait); static inline bool task_in_memcg_oom(struct task_struct *p) { return p->memcg_in_oom; } static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); current->in_user_fault = 1; } static inline void mem_cgroup_exit_user_fault(void) { WARN_ON(!current->in_user_fault); current->in_user_fault = 0; } #else /* CONFIG_MEMCG_V1 */ static inline unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { return 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { return false; } static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } static inline void mem_cgroup_enter_user_fault(void) { } static inline void mem_cgroup_exit_user_fault(void) { } #endif /* CONFIG_MEMCG_V1 */ #endif /* _LINUX_MEMCONTROL_H */
1720 1690 1710 154 859 267 409 409 397 409 1721 1718 1713 1632 2784 2596 1717 50 194 1714 1297 421 200 1715 421 822 1690 1081 1687 1717 823 793 825 1693 1687 1689 1704 1701 1633 1083 179 92 1043 174 18 172 1689 1695 1695 1 1693 833 822 822 825 825 547 825 834 420 104 104 1705 1701 255 1 1 836 833 87 86 484 1 2039 2959 2039 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment in folio_lru_refs() */ return order_base_2(refs + 1); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); bool workingset = flags & BIT(PG_workingset); /* * Return the number of accesses beyond PG_referenced, i.e., N-1 if the * total number of accesses is N>1, since N=0,1 both map to the first * tier. lru_tier_from_refs() will account for this off-by-one. Also see * the comment on MAX_NR_TIERS. */ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } static inline void folio_clear_lru_refs(struct folio *folio) { set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* * There are four common cases for this page: * 1. If it's hot, i.e., freshly faulted in, add it to the youngest * generation, and it's protected over the rest below. * 2. If it can't be evicted immediately, i.e., a dirty page pending * writeback, add it to the second youngest generation. * 3. If it should be evicted first, e.g., cold and clean from * folio_rotate_reclaimable(), add it to the oldest generation. * 4. Everything else falls between 2 & 3 above and is added to the * second oldest generation if it's considered inactive, or the * oldest generation otherwise. See lru_gen_is_active(). */ if (folio_test_active(folio)) seq = lrugen->max_seq; else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) seq = lrugen->max_seq - 1; else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) seq = lrugen->min_seq[type]; else seq = lrugen->min_seq[type] + 1; gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ mask = LRU_GEN_MASK; /* * Don't clear PG_workingset here because it can affect PSI accounting * if the activation is due to workingset refault. */ if (folio_test_active(folio)) mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); set_mask_bits(&folio->flags, mask, flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. */ static inline void pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); #endif } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif
1 3 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 // SPDX-License-Identifier: GPL-2.0-only /* * Framework for buffer objects that can be shared across devices/subsystems. * * Copyright(C) 2011 Linaro Limited. All rights reserved. * Author: Sumit Semwal <sumit.semwal@ti.com> * * Many thanks to linaro-mm-sig list, and specially * Arnd Bergmann <arnd@arndb.de>, Rob Clark <rob@ti.com> and * Daniel Vetter <daniel@ffwll.ch> for their support in creation and * refining of this idea. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/dma-buf.h> #include <linux/dma-fence.h> #include <linux/dma-fence-unwrap.h> #include <linux/anon_inodes.h> #include <linux/export.h> #include <linux/debugfs.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/sync_file.h> #include <linux/poll.h> #include <linux/dma-resv.h> #include <linux/mm.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <uapi/linux/dma-buf.h> #include <uapi/linux/magic.h> #include "dma-buf-sysfs-stats.h" static inline int is_dma_buf_file(struct file *); #if IS_ENABLED(CONFIG_DEBUG_FS) static DEFINE_MUTEX(debugfs_list_mutex); static LIST_HEAD(debugfs_list); static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { mutex_lock(&debugfs_list_mutex); list_add(&dmabuf->list_node, &debugfs_list); mutex_unlock(&debugfs_list_mutex); } static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf) { if (!dmabuf) return; mutex_lock(&debugfs_list_mutex); list_del(&dmabuf->list_node); mutex_unlock(&debugfs_list_mutex); } #else static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf) { } static void __dma_buf_debugfs_list_del(struct file *file) { } #endif static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen) { struct dma_buf *dmabuf; char name[DMA_BUF_NAME_LEN]; ssize_t ret = 0; dmabuf = dentry->d_fsdata; spin_lock(&dmabuf->name_lock); if (dmabuf->name) ret = strscpy(name, dmabuf->name, sizeof(name)); spin_unlock(&dmabuf->name_lock); return dynamic_dname(buffer, buflen, "/%s:%s", dentry->d_name.name, ret > 0 ? name : ""); } static void dma_buf_release(struct dentry *dentry) { struct dma_buf *dmabuf; dmabuf = dentry->d_fsdata; if (unlikely(!dmabuf)) return; BUG_ON(dmabuf->vmapping_counter); /* * If you hit this BUG() it could mean: * * There's a file reference imbalance in dma_buf_poll / dma_buf_poll_cb or somewhere else * * dmabuf->cb_in/out.active are non-0 despite no pending fence callback */ BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active); dma_buf_stats_teardown(dmabuf); dmabuf->ops->release(dmabuf); if (dmabuf->resv == (struct dma_resv *)&dmabuf[1]) dma_resv_fini(dmabuf->resv); WARN_ON(!list_empty(&dmabuf->attachments)); module_put(dmabuf->owner); kfree(dmabuf->name); kfree(dmabuf); } static int dma_buf_file_release(struct inode *inode, struct file *file) { if (!is_dma_buf_file(file)) return -EINVAL; __dma_buf_debugfs_list_del(file->private_data); return 0; } static const struct dentry_operations dma_buf_dentry_ops = { .d_dname = dmabuffs_dname, .d_release = dma_buf_release, }; static struct vfsmount *dma_buf_mnt; static int dma_buf_fs_init_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, DMA_BUF_MAGIC); if (!ctx) return -ENOMEM; ctx->dops = &dma_buf_dentry_ops; return 0; } static struct file_system_type dma_buf_fs_type = { .name = "dmabuf", .init_fs_context = dma_buf_fs_init_context, .kill_sb = kill_anon_super, }; static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma) { struct dma_buf *dmabuf; if (!is_dma_buf_file(file)) return -EINVAL; dmabuf = file->private_data; /* check if buffer supports mmap */ if (!dmabuf->ops->mmap) return -EINVAL; /* check for overflowing the buffer's size */ if (vma->vm_pgoff + vma_pages(vma) > dmabuf->size >> PAGE_SHIFT) return -EINVAL; return dmabuf->ops->mmap(dmabuf, vma); } static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence) { struct dma_buf *dmabuf; loff_t base; if (!is_dma_buf_file(file)) return -EBADF; dmabuf = file->private_data; /* only support discovering the end of the buffer, * but also allow SEEK_SET to maintain the idiomatic * SEEK_END(0), SEEK_CUR(0) pattern. */ if (whence == SEEK_END) base = dmabuf->size; else if (whence == SEEK_SET) base = 0; else return -EINVAL; if (offset != 0) return -EINVAL; return base + offset; } /** * DOC: implicit fence polling * * To support cross-device and cross-driver synchronization of buffer access * implicit fences (represented internally in the kernel with &struct dma_fence) * can be attached to a &dma_buf. The glue for that and a few related things are * provided in the &dma_resv structure. * * Userspace can query the state of these implicitly tracked fences using poll() * and related system calls: * * - Checking for EPOLLIN, i.e. read access, can be use to query the state of the * most recent write or exclusive fence. * * - Checking for EPOLLOUT, i.e. write access, can be used to query the state of * all attached fences, shared and exclusive ones. * * Note that this only signals the completion of the respective fences, i.e. the * DMA transfers are complete. Cache flushing and any other necessary * preparations before CPU access can begin still need to happen. * * As an alternative to poll(), the set of fences on DMA buffer can be * exported as a &sync_file using &dma_buf_sync_file_export. */ static void dma_buf_poll_cb(struct dma_fence *fence, struct dma_fence_cb *cb) { struct dma_buf_poll_cb_t *dcb = (struct dma_buf_poll_cb_t *)cb; struct dma_buf *dmabuf = container_of(dcb->poll, struct dma_buf, poll); unsigned long flags; spin_lock_irqsave(&dcb->poll->lock, flags); wake_up_locked_poll(dcb->poll, dcb->active); dcb->active = 0; spin_unlock_irqrestore(&dcb->poll->lock, flags); dma_fence_put(fence); /* Paired with get_file in dma_buf_poll */ fput(dmabuf->file); } static bool dma_buf_poll_add_cb(struct dma_resv *resv, bool write, struct dma_buf_poll_cb_t *dcb) { struct dma_resv_iter cursor; struct dma_fence *fence; int r; dma_resv_for_each_fence(&cursor, resv, dma_resv_usage_rw(write), fence) { dma_fence_get(fence); r = dma_fence_add_callback(fence, &dcb->cb, dma_buf_poll_cb); if (!r) return true; dma_fence_put(fence); } return false; } static __poll_t dma_buf_poll(struct file *file, poll_table *poll) { struct dma_buf *dmabuf; struct dma_resv *resv; __poll_t events; dmabuf = file->private_data; if (!dmabuf || !dmabuf->resv) return EPOLLERR; resv = dmabuf->resv; poll_wait(file, &dmabuf->poll, poll); events = poll_requested_events(poll) & (EPOLLIN | EPOLLOUT); if (!events) return 0; dma_resv_lock(resv, NULL); if (events & EPOLLOUT) { struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_out; /* Check that callback isn't busy */ spin_lock_irq(&dmabuf->poll.lock); if (dcb->active) events &= ~EPOLLOUT; else dcb->active = EPOLLOUT; spin_unlock_irq(&dmabuf->poll.lock); if (events & EPOLLOUT) { /* Paired with fput in dma_buf_poll_cb */ get_file(dmabuf->file); if (!dma_buf_poll_add_cb(resv, true, dcb)) /* No callback queued, wake up any other waiters */ dma_buf_poll_cb(NULL, &dcb->cb); else events &= ~EPOLLOUT; } } if (events & EPOLLIN) { struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_in; /* Check that callback isn't busy */ spin_lock_irq(&dmabuf->poll.lock); if (dcb->active) events &= ~EPOLLIN; else dcb->active = EPOLLIN; spin_unlock_irq(&dmabuf->poll.lock); if (events & EPOLLIN) { /* Paired with fput in dma_buf_poll_cb */ get_file(dmabuf->file); if (!dma_buf_poll_add_cb(resv, false, dcb)) /* No callback queued, wake up any other waiters */ dma_buf_poll_cb(NULL, &dcb->cb); else events &= ~EPOLLIN; } } dma_resv_unlock(resv); return events; } /** * dma_buf_set_name - Set a name to a specific dma_buf to track the usage. * It could support changing the name of the dma-buf if the same * piece of memory is used for multiple purpose between different devices. * * @dmabuf: [in] dmabuf buffer that will be renamed. * @buf: [in] A piece of userspace memory that contains the name of * the dma-buf. * * Returns 0 on success. If the dma-buf buffer is already attached to * devices, return -EBUSY. * */ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf) { char *name = strndup_user(buf, DMA_BUF_NAME_LEN); if (IS_ERR(name)) return PTR_ERR(name); spin_lock(&dmabuf->name_lock); kfree(dmabuf->name); dmabuf->name = name; spin_unlock(&dmabuf->name_lock); return 0; } #if IS_ENABLED(CONFIG_SYNC_FILE) static long dma_buf_export_sync_file(struct dma_buf *dmabuf, void __user *user_data) { struct dma_buf_export_sync_file arg; enum dma_resv_usage usage; struct dma_fence *fence = NULL; struct sync_file *sync_file; int fd, ret; if (copy_from_user(&arg, user_data, sizeof(arg))) return -EFAULT; if (arg.flags & ~DMA_BUF_SYNC_RW) return -EINVAL; if ((arg.flags & DMA_BUF_SYNC_RW) == 0) return -EINVAL; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; usage = dma_resv_usage_rw(arg.flags & DMA_BUF_SYNC_WRITE); ret = dma_resv_get_singleton(dmabuf->resv, usage, &fence); if (ret) goto err_put_fd; if (!fence) fence = dma_fence_get_stub(); sync_file = sync_file_create(fence); dma_fence_put(fence); if (!sync_file) { ret = -ENOMEM; goto err_put_fd; } arg.fd = fd; if (copy_to_user(user_data, &arg, sizeof(arg))) { ret = -EFAULT; goto err_put_file; } fd_install(fd, sync_file->file); return 0; err_put_file: fput(sync_file->file); err_put_fd: put_unused_fd(fd); return ret; } static long dma_buf_import_sync_file(struct dma_buf *dmabuf, const void __user *user_data) { struct dma_buf_import_sync_file arg; struct dma_fence *fence, *f; enum dma_resv_usage usage; struct dma_fence_unwrap iter; unsigned int num_fences; int ret = 0; if (copy_from_user(&arg, user_data, sizeof(arg))) return -EFAULT; if (arg.flags & ~DMA_BUF_SYNC_RW) return -EINVAL; if ((arg.flags & DMA_BUF_SYNC_RW) == 0) return -EINVAL; fence = sync_file_get_fence(arg.fd); if (!fence) return -EINVAL; usage = (arg.flags & DMA_BUF_SYNC_WRITE) ? DMA_RESV_USAGE_WRITE : DMA_RESV_USAGE_READ; num_fences = 0; dma_fence_unwrap_for_each(f, &iter, fence) ++num_fences; if (num_fences > 0) { dma_resv_lock(dmabuf->resv, NULL); ret = dma_resv_reserve_fences(dmabuf->resv, num_fences); if (!ret) { dma_fence_unwrap_for_each(f, &iter, fence) dma_resv_add_fence(dmabuf->resv, f, usage); } dma_resv_unlock(dmabuf->resv); } dma_fence_put(fence); return ret; } #endif static long dma_buf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct dma_buf *dmabuf; struct dma_buf_sync sync; enum dma_data_direction direction; int ret; dmabuf = file->private_data; switch (cmd) { case DMA_BUF_IOCTL_SYNC: if (copy_from_user(&sync, (void __user *) arg, sizeof(sync))) return -EFAULT; if (sync.flags & ~DMA_BUF_SYNC_VALID_FLAGS_MASK) return -EINVAL; switch (sync.flags & DMA_BUF_SYNC_RW) { case DMA_BUF_SYNC_READ: direction = DMA_FROM_DEVICE; break; case DMA_BUF_SYNC_WRITE: direction = DMA_TO_DEVICE; break; case DMA_BUF_SYNC_RW: direction = DMA_BIDIRECTIONAL; break; default: return -EINVAL; } if (sync.flags & DMA_BUF_SYNC_END) ret = dma_buf_end_cpu_access(dmabuf, direction); else ret = dma_buf_begin_cpu_access(dmabuf, direction); return ret; case DMA_BUF_SET_NAME_A: case DMA_BUF_SET_NAME_B: return dma_buf_set_name(dmabuf, (const char __user *)arg); #if IS_ENABLED(CONFIG_SYNC_FILE) case DMA_BUF_IOCTL_EXPORT_SYNC_FILE: return dma_buf_export_sync_file(dmabuf, (void __user *)arg); case DMA_BUF_IOCTL_IMPORT_SYNC_FILE: return dma_buf_import_sync_file(dmabuf, (const void __user *)arg); #endif default: return -ENOTTY; } } static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) { struct dma_buf *dmabuf = file->private_data; seq_printf(m, "size:\t%zu\n", dmabuf->size); /* Don't count the temporary reference taken inside procfs seq_show */ seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1); seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name); spin_lock(&dmabuf->name_lock); if (dmabuf->name) seq_printf(m, "name:\t%s\n", dmabuf->name); spin_unlock(&dmabuf->name_lock); } static const struct file_operations dma_buf_fops = { .release = dma_buf_file_release, .mmap = dma_buf_mmap_internal, .llseek = dma_buf_llseek, .poll = dma_buf_poll, .unlocked_ioctl = dma_buf_ioctl, .compat_ioctl = compat_ptr_ioctl, .show_fdinfo = dma_buf_show_fdinfo, }; /* * is_dma_buf_file - Check if struct file* is associated with dma_buf */ static inline int is_dma_buf_file(struct file *file) { return file->f_op == &dma_buf_fops; } static struct file *dma_buf_getfile(size_t size, int flags) { static atomic64_t dmabuf_inode = ATOMIC64_INIT(0); struct inode *inode = alloc_anon_inode(dma_buf_mnt->mnt_sb); struct file *file; if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_size = size; inode_set_bytes(inode, size); /* * The ->i_ino acquired from get_next_ino() is not unique thus * not suitable for using it as dentry name by dmabuf stats. * Override ->i_ino with the unique and dmabuffs specific * value. */ inode->i_ino = atomic64_inc_return(&dmabuf_inode); flags &= O_ACCMODE | O_NONBLOCK; file = alloc_file_pseudo(inode, dma_buf_mnt, "dmabuf", flags, &dma_buf_fops); if (IS_ERR(file)) goto err_alloc_file; return file; err_alloc_file: iput(inode); return file; } /** * DOC: dma buf device access * * For device DMA access to a shared DMA buffer the usual sequence of operations * is fairly simple: * * 1. The exporter defines his exporter instance using * DEFINE_DMA_BUF_EXPORT_INFO() and calls dma_buf_export() to wrap a private * buffer object into a &dma_buf. It then exports that &dma_buf to userspace * as a file descriptor by calling dma_buf_fd(). * * 2. Userspace passes this file-descriptors to all drivers it wants this buffer * to share with: First the file descriptor is converted to a &dma_buf using * dma_buf_get(). Then the buffer is attached to the device using * dma_buf_attach(). * * Up to this stage the exporter is still free to migrate or reallocate the * backing storage. * * 3. Once the buffer is attached to all devices userspace can initiate DMA * access to the shared buffer. In the kernel this is done by calling * dma_buf_map_attachment() and dma_buf_unmap_attachment(). * * 4. Once a driver is done with a shared buffer it needs to call * dma_buf_detach() (after cleaning up any mappings) and then release the * reference acquired with dma_buf_get() by calling dma_buf_put(). * * For the detailed semantics exporters are expected to implement see * &dma_buf_ops. */ /** * dma_buf_export - Creates a new dma_buf, and associates an anon file * with this buffer, so it can be exported. * Also connect the allocator specific data and ops to the buffer. * Additionally, provide a name string for exporter; useful in debugging. * * @exp_info: [in] holds all the export related information provided * by the exporter. see &struct dma_buf_export_info * for further details. * * Returns, on success, a newly created struct dma_buf object, which wraps the * supplied private data and operations for struct dma_buf_ops. On either * missing ops, or error in allocating struct dma_buf, will return negative * error. * * For most cases the easiest way to create @exp_info is through the * %DEFINE_DMA_BUF_EXPORT_INFO macro. */ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) { struct dma_buf *dmabuf; struct dma_resv *resv = exp_info->resv; struct file *file; size_t alloc_size = sizeof(struct dma_buf); int ret; if (WARN_ON(!exp_info->priv || !exp_info->ops || !exp_info->ops->map_dma_buf || !exp_info->ops->unmap_dma_buf || !exp_info->ops->release)) return ERR_PTR(-EINVAL); if (WARN_ON(exp_info->ops->cache_sgt_mapping && (exp_info->ops->pin || exp_info->ops->unpin))) return ERR_PTR(-EINVAL); if (WARN_ON(!exp_info->ops->pin != !exp_info->ops->unpin)) return ERR_PTR(-EINVAL); if (!try_module_get(exp_info->owner)) return ERR_PTR(-ENOENT); file = dma_buf_getfile(exp_info->size, exp_info->flags); if (IS_ERR(file)) { ret = PTR_ERR(file); goto err_module; } if (!exp_info->resv) alloc_size += sizeof(struct dma_resv); else /* prevent &dma_buf[1] == dma_buf->resv */ alloc_size += 1; dmabuf = kzalloc(alloc_size, GFP_KERNEL); if (!dmabuf) { ret = -ENOMEM; goto err_file; } dmabuf->priv = exp_info->priv; dmabuf->ops = exp_info->ops; dmabuf->size = exp_info->size; dmabuf->exp_name = exp_info->exp_name; dmabuf->owner = exp_info->owner; spin_lock_init(&dmabuf->name_lock); init_waitqueue_head(&dmabuf->poll); dmabuf->cb_in.poll = dmabuf->cb_out.poll = &dmabuf->poll; dmabuf->cb_in.active = dmabuf->cb_out.active = 0; INIT_LIST_HEAD(&dmabuf->attachments); if (!resv) { dmabuf->resv = (struct dma_resv *)&dmabuf[1]; dma_resv_init(dmabuf->resv); } else { dmabuf->resv = resv; } ret = dma_buf_stats_setup(dmabuf, file); if (ret) goto err_dmabuf; file->private_data = dmabuf; file->f_path.dentry->d_fsdata = dmabuf; dmabuf->file = file; __dma_buf_debugfs_list_add(dmabuf); return dmabuf; err_dmabuf: if (!resv) dma_resv_fini(dmabuf->resv); kfree(dmabuf); err_file: fput(file); err_module: module_put(exp_info->owner); return ERR_PTR(ret); } EXPORT_SYMBOL_NS_GPL(dma_buf_export, DMA_BUF); /** * dma_buf_fd - returns a file descriptor for the given struct dma_buf * @dmabuf: [in] pointer to dma_buf for which fd is required. * @flags: [in] flags to give to fd * * On success, returns an associated 'fd'. Else, returns error. */ int dma_buf_fd(struct dma_buf *dmabuf, int flags) { int fd; if (!dmabuf || !dmabuf->file) return -EINVAL; fd = get_unused_fd_flags(flags); if (fd < 0) return fd; fd_install(fd, dmabuf->file); return fd; } EXPORT_SYMBOL_NS_GPL(dma_buf_fd, DMA_BUF); /** * dma_buf_get - returns the struct dma_buf related to an fd * @fd: [in] fd associated with the struct dma_buf to be returned * * On success, returns the struct dma_buf associated with an fd; uses * file's refcounting done by fget to increase refcount. returns ERR_PTR * otherwise. */ struct dma_buf *dma_buf_get(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (!is_dma_buf_file(file)) { fput(file); return ERR_PTR(-EINVAL); } return file->private_data; } EXPORT_SYMBOL_NS_GPL(dma_buf_get, DMA_BUF); /** * dma_buf_put - decreases refcount of the buffer * @dmabuf: [in] buffer to reduce refcount of * * Uses file's refcounting done implicitly by fput(). * * If, as a result of this call, the refcount becomes 0, the 'release' file * operation related to this fd is called. It calls &dma_buf_ops.release vfunc * in turn, and frees the memory allocated for dmabuf when exported. */ void dma_buf_put(struct dma_buf *dmabuf) { if (WARN_ON(!dmabuf || !dmabuf->file)) return; fput(dmabuf->file); } EXPORT_SYMBOL_NS_GPL(dma_buf_put, DMA_BUF); static void mangle_sg_table(struct sg_table *sg_table) { #ifdef CONFIG_DMABUF_DEBUG int i; struct scatterlist *sg; /* To catch abuse of the underlying struct page by importers mix * up the bits, but take care to preserve the low SG_ bits to * not corrupt the sgt. The mixing is undone in __unmap_dma_buf * before passing the sgt back to the exporter. */ for_each_sgtable_sg(sg_table, sg, i) sg->page_link ^= ~0xffUL; #endif } static struct sg_table *__map_dma_buf(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; signed long ret; sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); if (IS_ERR_OR_NULL(sg_table)) return sg_table; if (!dma_buf_attachment_is_dynamic(attach)) { ret = dma_resv_wait_timeout(attach->dmabuf->resv, DMA_RESV_USAGE_KERNEL, true, MAX_SCHEDULE_TIMEOUT); if (ret < 0) { attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); return ERR_PTR(ret); } } mangle_sg_table(sg_table); return sg_table; } /** * DOC: locking convention * * In order to avoid deadlock situations between dma-buf exports and importers, * all dma-buf API users must follow the common dma-buf locking convention. * * Convention for importers * * 1. Importers must hold the dma-buf reservation lock when calling these * functions: * * - dma_buf_pin() * - dma_buf_unpin() * - dma_buf_map_attachment() * - dma_buf_unmap_attachment() * - dma_buf_vmap() * - dma_buf_vunmap() * * 2. Importers must not hold the dma-buf reservation lock when calling these * functions: * * - dma_buf_attach() * - dma_buf_dynamic_attach() * - dma_buf_detach() * - dma_buf_export() * - dma_buf_fd() * - dma_buf_get() * - dma_buf_put() * - dma_buf_mmap() * - dma_buf_begin_cpu_access() * - dma_buf_end_cpu_access() * - dma_buf_map_attachment_unlocked() * - dma_buf_unmap_attachment_unlocked() * - dma_buf_vmap_unlocked() * - dma_buf_vunmap_unlocked() * * Convention for exporters * * 1. These &dma_buf_ops callbacks are invoked with unlocked dma-buf * reservation and exporter can take the lock: * * - &dma_buf_ops.attach() * - &dma_buf_ops.detach() * - &dma_buf_ops.release() * - &dma_buf_ops.begin_cpu_access() * - &dma_buf_ops.end_cpu_access() * - &dma_buf_ops.mmap() * * 2. These &dma_buf_ops callbacks are invoked with locked dma-buf * reservation and exporter can't take the lock: * * - &dma_buf_ops.pin() * - &dma_buf_ops.unpin() * - &dma_buf_ops.map_dma_buf() * - &dma_buf_ops.unmap_dma_buf() * - &dma_buf_ops.vmap() * - &dma_buf_ops.vunmap() * * 3. Exporters must hold the dma-buf reservation lock when calling these * functions: * * - dma_buf_move_notify() */ /** * dma_buf_dynamic_attach - Add the device to dma_buf's attachments list * @dmabuf: [in] buffer to attach device to. * @dev: [in] device to be attached. * @importer_ops: [in] importer operations for the attachment * @importer_priv: [in] importer private pointer for the attachment * * Returns struct dma_buf_attachment pointer for this attachment. Attachments * must be cleaned up by calling dma_buf_detach(). * * Optionally this calls &dma_buf_ops.attach to allow device-specific attach * functionality. * * Returns: * * A pointer to newly created &dma_buf_attachment on success, or a negative * error code wrapped into a pointer on failure. * * Note that this can fail if the backing storage of @dmabuf is in a place not * accessible to @dev, and cannot be moved to a more suitable place. This is * indicated with the error code -EBUSY. */ struct dma_buf_attachment * dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, const struct dma_buf_attach_ops *importer_ops, void *importer_priv) { struct dma_buf_attachment *attach; int ret; if (WARN_ON(!dmabuf || !dev)) return ERR_PTR(-EINVAL); if (WARN_ON(importer_ops && !importer_ops->move_notify)) return ERR_PTR(-EINVAL); attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) return ERR_PTR(-ENOMEM); attach->dev = dev; attach->dmabuf = dmabuf; if (importer_ops) attach->peer2peer = importer_ops->allow_peer2peer; attach->importer_ops = importer_ops; attach->importer_priv = importer_priv; if (dmabuf->ops->attach) { ret = dmabuf->ops->attach(dmabuf, attach); if (ret) goto err_attach; } dma_resv_lock(dmabuf->resv, NULL); list_add(&attach->node, &dmabuf->attachments); dma_resv_unlock(dmabuf->resv); /* When either the importer or the exporter can't handle dynamic * mappings we cache the mapping here to avoid issues with the * reservation object lock. */ if (dma_buf_attachment_is_dynamic(attach) != dma_buf_is_dynamic(dmabuf)) { struct sg_table *sgt; dma_resv_lock(attach->dmabuf->resv, NULL); if (dma_buf_is_dynamic(attach->dmabuf)) { ret = dmabuf->ops->pin(attach); if (ret) goto err_unlock; } sgt = __map_dma_buf(attach, DMA_BIDIRECTIONAL); if (!sgt) sgt = ERR_PTR(-ENOMEM); if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); goto err_unpin; } dma_resv_unlock(attach->dmabuf->resv); attach->sgt = sgt; attach->dir = DMA_BIDIRECTIONAL; } return attach; err_attach: kfree(attach); return ERR_PTR(ret); err_unpin: if (dma_buf_is_dynamic(attach->dmabuf)) dmabuf->ops->unpin(attach); err_unlock: dma_resv_unlock(attach->dmabuf->resv); dma_buf_detach(dmabuf, attach); return ERR_PTR(ret); } EXPORT_SYMBOL_NS_GPL(dma_buf_dynamic_attach, DMA_BUF); /** * dma_buf_attach - Wrapper for dma_buf_dynamic_attach * @dmabuf: [in] buffer to attach device to. * @dev: [in] device to be attached. * * Wrapper to call dma_buf_dynamic_attach() for drivers which still use a static * mapping. */ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev) { return dma_buf_dynamic_attach(dmabuf, dev, NULL, NULL); } EXPORT_SYMBOL_NS_GPL(dma_buf_attach, DMA_BUF); static void __unmap_dma_buf(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction) { /* uses XOR, hence this unmangles */ mangle_sg_table(sg_table); attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); } /** * dma_buf_detach - Remove the given attachment from dmabuf's attachments list * @dmabuf: [in] buffer to detach from. * @attach: [in] attachment to be detached; is free'd after this call. * * Clean up a device attachment obtained by calling dma_buf_attach(). * * Optionally this calls &dma_buf_ops.detach for device-specific detach. */ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) { if (WARN_ON(!dmabuf || !attach || dmabuf != attach->dmabuf)) return; dma_resv_lock(dmabuf->resv, NULL); if (attach->sgt) { __unmap_dma_buf(attach, attach->sgt, attach->dir); if (dma_buf_is_dynamic(attach->dmabuf)) dmabuf->ops->unpin(attach); } list_del(&attach->node); dma_resv_unlock(dmabuf->resv); if (dmabuf->ops->detach) dmabuf->ops->detach(dmabuf, attach); kfree(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_detach, DMA_BUF); /** * dma_buf_pin - Lock down the DMA-buf * @attach: [in] attachment which should be pinned * * Only dynamic importers (who set up @attach with dma_buf_dynamic_attach()) may * call this, and only for limited use cases like scanout and not for temporary * pin operations. It is not permitted to allow userspace to pin arbitrary * amounts of buffers through this interface. * * Buffers must be unpinned by calling dma_buf_unpin(). * * Returns: * 0 on success, negative error code on failure. */ int dma_buf_pin(struct dma_buf_attachment *attach) { struct dma_buf *dmabuf = attach->dmabuf; int ret = 0; WARN_ON(!dma_buf_attachment_is_dynamic(attach)); dma_resv_assert_held(dmabuf->resv); if (dmabuf->ops->pin) ret = dmabuf->ops->pin(attach); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_pin, DMA_BUF); /** * dma_buf_unpin - Unpin a DMA-buf * @attach: [in] attachment which should be unpinned * * This unpins a buffer pinned by dma_buf_pin() and allows the exporter to move * any mapping of @attach again and inform the importer through * &dma_buf_attach_ops.move_notify. */ void dma_buf_unpin(struct dma_buf_attachment *attach) { struct dma_buf *dmabuf = attach->dmabuf; WARN_ON(!dma_buf_attachment_is_dynamic(attach)); dma_resv_assert_held(dmabuf->resv); if (dmabuf->ops->unpin) dmabuf->ops->unpin(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_unpin, DMA_BUF); /** * dma_buf_map_attachment - Returns the scatterlist table of the attachment; * mapped into _device_ address space. Is a wrapper for map_dma_buf() of the * dma_buf_ops. * @attach: [in] attachment whose scatterlist is to be returned * @direction: [in] direction of DMA transfer * * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR * on error. May return -EINTR if it is interrupted by a signal. * * On success, the DMA addresses and lengths in the returned scatterlist are * PAGE_SIZE aligned. * * A mapping must be unmapped by using dma_buf_unmap_attachment(). Note that * the underlying backing storage is pinned for as long as a mapping exists, * therefore users/importers should not hold onto a mapping for undue amounts of * time. * * Important: Dynamic importers must wait for the exclusive fence of the struct * dma_resv attached to the DMA-BUF first. */ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; int r; might_sleep(); if (WARN_ON(!attach || !attach->dmabuf)) return ERR_PTR(-EINVAL); dma_resv_assert_held(attach->dmabuf->resv); if (attach->sgt) { /* * Two mappings with different directions for the same * attachment are not allowed. */ if (attach->dir != direction && attach->dir != DMA_BIDIRECTIONAL) return ERR_PTR(-EBUSY); return attach->sgt; } if (dma_buf_is_dynamic(attach->dmabuf)) { if (!IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) { r = attach->dmabuf->ops->pin(attach); if (r) return ERR_PTR(r); } } sg_table = __map_dma_buf(attach, direction); if (!sg_table) sg_table = ERR_PTR(-ENOMEM); if (IS_ERR(sg_table) && dma_buf_is_dynamic(attach->dmabuf) && !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) attach->dmabuf->ops->unpin(attach); if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) { attach->sgt = sg_table; attach->dir = direction; } #ifdef CONFIG_DMA_API_DEBUG if (!IS_ERR(sg_table)) { struct scatterlist *sg; u64 addr; int len; int i; for_each_sgtable_dma_sg(sg_table, sg, i) { addr = sg_dma_address(sg); len = sg_dma_len(sg); if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(len)) { pr_debug("%s: addr %llx or len %x is not page aligned!\n", __func__, addr, len); } } } #endif /* CONFIG_DMA_API_DEBUG */ return sg_table; } EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment, DMA_BUF); /** * dma_buf_map_attachment_unlocked - Returns the scatterlist table of the attachment; * mapped into _device_ address space. Is a wrapper for map_dma_buf() of the * dma_buf_ops. * @attach: [in] attachment whose scatterlist is to be returned * @direction: [in] direction of DMA transfer * * Unlocked variant of dma_buf_map_attachment(). */ struct sg_table * dma_buf_map_attachment_unlocked(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; might_sleep(); if (WARN_ON(!attach || !attach->dmabuf)) return ERR_PTR(-EINVAL); dma_resv_lock(attach->dmabuf->resv, NULL); sg_table = dma_buf_map_attachment(attach, direction); dma_resv_unlock(attach->dmabuf->resv); return sg_table; } EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment_unlocked, DMA_BUF); /** * dma_buf_unmap_attachment - unmaps and decreases usecount of the buffer;might * deallocate the scatterlist associated. Is a wrapper for unmap_dma_buf() of * dma_buf_ops. * @attach: [in] attachment to unmap buffer from * @sg_table: [in] scatterlist info of the buffer to unmap * @direction: [in] direction of DMA transfer * * This unmaps a DMA mapping for @attached obtained by dma_buf_map_attachment(). */ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction) { might_sleep(); if (WARN_ON(!attach || !attach->dmabuf || !sg_table)) return; dma_resv_assert_held(attach->dmabuf->resv); if (attach->sgt == sg_table) return; __unmap_dma_buf(attach, sg_table, direction); if (dma_buf_is_dynamic(attach->dmabuf) && !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) dma_buf_unpin(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment, DMA_BUF); /** * dma_buf_unmap_attachment_unlocked - unmaps and decreases usecount of the buffer;might * deallocate the scatterlist associated. Is a wrapper for unmap_dma_buf() of * dma_buf_ops. * @attach: [in] attachment to unmap buffer from * @sg_table: [in] scatterlist info of the buffer to unmap * @direction: [in] direction of DMA transfer * * Unlocked variant of dma_buf_unmap_attachment(). */ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction) { might_sleep(); if (WARN_ON(!attach || !attach->dmabuf || !sg_table)) return; dma_resv_lock(attach->dmabuf->resv, NULL); dma_buf_unmap_attachment(attach, sg_table, direction); dma_resv_unlock(attach->dmabuf->resv); } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, DMA_BUF); /** * dma_buf_move_notify - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their * mappings. */ void dma_buf_move_notify(struct dma_buf *dmabuf) { struct dma_buf_attachment *attach; dma_resv_assert_held(dmabuf->resv); list_for_each_entry(attach, &dmabuf->attachments, node) if (attach->importer_ops) attach->importer_ops->move_notify(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, DMA_BUF); /** * DOC: cpu access * * There are multiple reasons for supporting CPU access to a dma buffer object: * * - Fallback operations in the kernel, for example when a device is connected * over USB and the kernel needs to shuffle the data around first before * sending it away. Cache coherency is handled by bracketing any transactions * with calls to dma_buf_begin_cpu_access() and dma_buf_end_cpu_access() * access. * * Since for most kernel internal dma-buf accesses need the entire buffer, a * vmap interface is introduced. Note that on very old 32-bit architectures * vmalloc space might be limited and result in vmap calls failing. * * Interfaces: * * .. code-block:: c * * void *dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map) * void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map) * * The vmap call can fail if there is no vmap support in the exporter, or if * it runs out of vmalloc space. Note that the dma-buf layer keeps a reference * count for all vmap access and calls down into the exporter's vmap function * only when no vmapping exists, and only unmaps it once. Protection against * concurrent vmap/vunmap calls is provided by taking the &dma_buf.lock mutex. * * - For full compatibility on the importer side with existing userspace * interfaces, which might already support mmap'ing buffers. This is needed in * many processing pipelines (e.g. feeding a software rendered image into a * hardware pipeline, thumbnail creation, snapshots, ...). Also, Android's ION * framework already supported this and for DMA buffer file descriptors to * replace ION buffers mmap support was needed. * * There is no special interfaces, userspace simply calls mmap on the dma-buf * fd. But like for CPU access there's a need to bracket the actual access, * which is handled by the ioctl (DMA_BUF_IOCTL_SYNC). Note that * DMA_BUF_IOCTL_SYNC can fail with -EAGAIN or -EINTR, in which case it must * be restarted. * * Some systems might need some sort of cache coherency management e.g. when * CPU and GPU domains are being accessed through dma-buf at the same time. * To circumvent this problem there are begin/end coherency markers, that * forward directly to existing dma-buf device drivers vfunc hooks. Userspace * can make use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The * sequence would be used like following: * * - mmap dma-buf fd * - for each drawing/upload cycle in CPU 1. SYNC_START ioctl, 2. read/write * to mmap area 3. SYNC_END ioctl. This can be repeated as often as you * want (with the new data being consumed by say the GPU or the scanout * device) * - munmap once you don't need the buffer any more * * For correctness and optimal performance, it is always required to use * SYNC_START and SYNC_END before and after, respectively, when accessing the * mapped address. Userspace cannot rely on coherent access, even when there * are systems where it just works without calling these ioctls. * * - And as a CPU fallback in userspace processing pipelines. * * Similar to the motivation for kernel cpu access it is again important that * the userspace code of a given importing subsystem can use the same * interfaces with a imported dma-buf buffer object as with a native buffer * object. This is especially important for drm where the userspace part of * contemporary OpenGL, X, and other drivers is huge, and reworking them to * use a different way to mmap a buffer rather invasive. * * The assumption in the current dma-buf interfaces is that redirecting the * initial mmap is all that's needed. A survey of some of the existing * subsystems shows that no driver seems to do any nefarious thing like * syncing up with outstanding asynchronous processing on the device or * allocating special resources at fault time. So hopefully this is good * enough, since adding interfaces to intercept pagefaults and allow pte * shootdowns would increase the complexity quite a bit. * * Interface: * * .. code-block:: c * * int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); * * If the importing subsystem simply provides a special-purpose mmap call to * set up a mapping in userspace, calling do_mmap with &dma_buf.file will * equally achieve that for a dma-buf object. */ static int __dma_buf_begin_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { bool write = (direction == DMA_BIDIRECTIONAL || direction == DMA_TO_DEVICE); struct dma_resv *resv = dmabuf->resv; long ret; /* Wait on any implicit rendering fences */ ret = dma_resv_wait_timeout(resv, dma_resv_usage_rw(write), true, MAX_SCHEDULE_TIMEOUT); if (ret < 0) return ret; return 0; } /** * dma_buf_begin_cpu_access - Must be called before accessing a dma_buf from the * cpu in the kernel context. Calls begin_cpu_access to allow exporter-specific * preparations. Coherency is only guaranteed in the specified range for the * specified access direction. * @dmabuf: [in] buffer to prepare cpu access for. * @direction: [in] direction of access. * * After the cpu access is complete the caller should call * dma_buf_end_cpu_access(). Only when cpu access is bracketed by both calls is * it guaranteed to be coherent with other DMA access. * * This function will also wait for any DMA transactions tracked through * implicit synchronization in &dma_buf.resv. For DMA transactions with explicit * synchronization this function will only ensure cache coherency, callers must * ensure synchronization with such DMA transactions on their own. * * Can return negative error values, returns 0 on success. */ int dma_buf_begin_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { int ret = 0; if (WARN_ON(!dmabuf)) return -EINVAL; might_lock(&dmabuf->resv->lock.base); if (dmabuf->ops->begin_cpu_access) ret = dmabuf->ops->begin_cpu_access(dmabuf, direction); /* Ensure that all fences are waited upon - but we first allow * the native handler the chance to do so more efficiently if it * chooses. A double invocation here will be reasonably cheap no-op. */ if (ret == 0) ret = __dma_buf_begin_cpu_access(dmabuf, direction); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_begin_cpu_access, DMA_BUF); /** * dma_buf_end_cpu_access - Must be called after accessing a dma_buf from the * cpu in the kernel context. Calls end_cpu_access to allow exporter-specific * actions. Coherency is only guaranteed in the specified range for the * specified access direction. * @dmabuf: [in] buffer to complete cpu access for. * @direction: [in] direction of access. * * This terminates CPU access started with dma_buf_begin_cpu_access(). * * Can return negative error values, returns 0 on success. */ int dma_buf_end_cpu_access(struct dma_buf *dmabuf, enum dma_data_direction direction) { int ret = 0; WARN_ON(!dmabuf); might_lock(&dmabuf->resv->lock.base); if (dmabuf->ops->end_cpu_access) ret = dmabuf->ops->end_cpu_access(dmabuf, direction); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_end_cpu_access, DMA_BUF); /** * dma_buf_mmap - Setup up a userspace mmap with the given vma * @dmabuf: [in] buffer that should back the vma * @vma: [in] vma for the mmap * @pgoff: [in] offset in pages where this mmap should start within the * dma-buf buffer. * * This function adjusts the passed in vma so that it points at the file of the * dma_buf operation. It also adjusts the starting pgoff and does bounds * checking on the size of the vma. Then it calls the exporters mmap function to * set up the mapping. * * Can return negative error values, returns 0 on success. */ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma, unsigned long pgoff) { if (WARN_ON(!dmabuf || !vma)) return -EINVAL; /* check if buffer supports mmap */ if (!dmabuf->ops->mmap) return -EINVAL; /* check for offset overflow */ if (pgoff + vma_pages(vma) < pgoff) return -EOVERFLOW; /* check for overflowing the buffer's size */ if (pgoff + vma_pages(vma) > dmabuf->size >> PAGE_SHIFT) return -EINVAL; /* readjust the vma */ vma_set_file(vma, dmabuf->file); vma->vm_pgoff = pgoff; return dmabuf->ops->mmap(dmabuf, vma); } EXPORT_SYMBOL_NS_GPL(dma_buf_mmap, DMA_BUF); /** * dma_buf_vmap - Create virtual mapping for the buffer object into kernel * address space. Same restrictions as for vmap and friends apply. * @dmabuf: [in] buffer to vmap * @map: [out] returns the vmap pointer * * This call may fail due to lack of virtual mapping address space. * These calls are optional in drivers. The intended use for them * is for mapping objects linear in kernel space for high use objects. * * To ensure coherency users must call dma_buf_begin_cpu_access() and * dma_buf_end_cpu_access() around any cpu access performed through this * mapping. * * Returns 0 on success, or a negative errno code otherwise. */ int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map) { struct iosys_map ptr; int ret; iosys_map_clear(map); if (WARN_ON(!dmabuf)) return -EINVAL; dma_resv_assert_held(dmabuf->resv); if (!dmabuf->ops->vmap) return -EINVAL; if (dmabuf->vmapping_counter) { dmabuf->vmapping_counter++; BUG_ON(iosys_map_is_null(&dmabuf->vmap_ptr)); *map = dmabuf->vmap_ptr; return 0; } BUG_ON(iosys_map_is_set(&dmabuf->vmap_ptr)); ret = dmabuf->ops->vmap(dmabuf, &ptr); if (WARN_ON_ONCE(ret)) return ret; dmabuf->vmap_ptr = ptr; dmabuf->vmapping_counter = 1; *map = dmabuf->vmap_ptr; return 0; } EXPORT_SYMBOL_NS_GPL(dma_buf_vmap, DMA_BUF); /** * dma_buf_vmap_unlocked - Create virtual mapping for the buffer object into kernel * address space. Same restrictions as for vmap and friends apply. * @dmabuf: [in] buffer to vmap * @map: [out] returns the vmap pointer * * Unlocked version of dma_buf_vmap() * * Returns 0 on success, or a negative errno code otherwise. */ int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map) { int ret; iosys_map_clear(map); if (WARN_ON(!dmabuf)) return -EINVAL; dma_resv_lock(dmabuf->resv, NULL); ret = dma_buf_vmap(dmabuf, map); dma_resv_unlock(dmabuf->resv); return ret; } EXPORT_SYMBOL_NS_GPL(dma_buf_vmap_unlocked, DMA_BUF); /** * dma_buf_vunmap - Unmap a vmap obtained by dma_buf_vmap. * @dmabuf: [in] buffer to vunmap * @map: [in] vmap pointer to vunmap */ void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map) { if (WARN_ON(!dmabuf)) return; dma_resv_assert_held(dmabuf->resv); BUG_ON(iosys_map_is_null(&dmabuf->vmap_ptr)); BUG_ON(dmabuf->vmapping_counter == 0); BUG_ON(!iosys_map_is_equal(&dmabuf->vmap_ptr, map)); if (--dmabuf->vmapping_counter == 0) { if (dmabuf->ops->vunmap) dmabuf->ops->vunmap(dmabuf, map); iosys_map_clear(&dmabuf->vmap_ptr); } } EXPORT_SYMBOL_NS_GPL(dma_buf_vunmap, DMA_BUF); /** * dma_buf_vunmap_unlocked - Unmap a vmap obtained by dma_buf_vmap. * @dmabuf: [in] buffer to vunmap * @map: [in] vmap pointer to vunmap */ void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map) { if (WARN_ON(!dmabuf)) return; dma_resv_lock(dmabuf->resv, NULL); dma_buf_vunmap(dmabuf, map); dma_resv_unlock(dmabuf->resv); } EXPORT_SYMBOL_NS_GPL(dma_buf_vunmap_unlocked, DMA_BUF); #ifdef CONFIG_DEBUG_FS static int dma_buf_debug_show(struct seq_file *s, void *unused) { struct dma_buf *buf_obj; struct dma_buf_attachment *attach_obj; int count = 0, attach_count; size_t size = 0; int ret; ret = mutex_lock_interruptible(&debugfs_list_mutex); if (ret) return ret; seq_puts(s, "\nDma-buf Objects:\n"); seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\t%-8s\tname\n", "size", "flags", "mode", "count", "ino"); list_for_each_entry(buf_obj, &debugfs_list, list_node) { ret = dma_resv_lock_interruptible(buf_obj->resv, NULL); if (ret) goto error_unlock; spin_lock(&buf_obj->name_lock); seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, file_count(buf_obj->file), buf_obj->exp_name, file_inode(buf_obj->file)->i_ino, buf_obj->name ?: "<none>"); spin_unlock(&buf_obj->name_lock); dma_resv_describe(buf_obj->resv, s); seq_puts(s, "\tAttached Devices:\n"); attach_count = 0; list_for_each_entry(attach_obj, &buf_obj->attachments, node) { seq_printf(s, "\t%s\n", dev_name(attach_obj->dev)); attach_count++; } dma_resv_unlock(buf_obj->resv); seq_printf(s, "Total %d devices attached\n\n", attach_count); count++; size += buf_obj->size; } seq_printf(s, "\nTotal %d objects, %zu bytes\n", count, size); mutex_unlock(&debugfs_list_mutex); return 0; error_unlock: mutex_unlock(&debugfs_list_mutex); return ret; } DEFINE_SHOW_ATTRIBUTE(dma_buf_debug); static struct dentry *dma_buf_debugfs_dir; static int dma_buf_init_debugfs(void) { struct dentry *d; int err = 0; d = debugfs_create_dir("dma_buf", NULL); if (IS_ERR(d)) return PTR_ERR(d); dma_buf_debugfs_dir = d; d = debugfs_create_file("bufinfo", 0444, dma_buf_debugfs_dir, NULL, &dma_buf_debug_fops); if (IS_ERR(d)) { pr_debug("dma_buf: debugfs: failed to create node bufinfo\n"); debugfs_remove_recursive(dma_buf_debugfs_dir); dma_buf_debugfs_dir = NULL; err = PTR_ERR(d); } return err; } static void dma_buf_uninit_debugfs(void) { debugfs_remove_recursive(dma_buf_debugfs_dir); } #else static inline int dma_buf_init_debugfs(void) { return 0; } static inline void dma_buf_uninit_debugfs(void) { } #endif static int __init dma_buf_init(void) { int ret; ret = dma_buf_init_sysfs_statistics(); if (ret) return ret; dma_buf_mnt = kern_mount(&dma_buf_fs_type); if (IS_ERR(dma_buf_mnt)) return PTR_ERR(dma_buf_mnt); dma_buf_init_debugfs(); return 0; } subsys_initcall(dma_buf_init); static void __exit dma_buf_deinit(void) { dma_buf_uninit_debugfs(); kern_unmount(dma_buf_mnt); dma_buf_uninit_sysfs_statistics(); } __exitcall(dma_buf_deinit);
62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0+ /* * linux/net/sunrpc/gss_rpc_upcall.c * * Copyright (C) 2012 Simo Sorce <simo@redhat.com> */ #include <linux/types.h> #include <linux/un.h> #include <linux/sunrpc/svcauth.h> #include "gss_rpc_upcall.h" #define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock" #define GSSPROXY_PROGRAM (400112u) #define GSSPROXY_VERS_1 (1u) /* * Encoding/Decoding functions */ enum { GSSX_NULL = 0, /* Unused */ GSSX_INDICATE_MECHS = 1, GSSX_GET_CALL_CONTEXT = 2, GSSX_IMPORT_AND_CANON_NAME = 3, GSSX_EXPORT_CRED = 4, GSSX_IMPORT_CRED = 5, GSSX_ACQUIRE_CRED = 6, GSSX_STORE_CRED = 7, GSSX_INIT_SEC_CONTEXT = 8, GSSX_ACCEPT_SEC_CONTEXT = 9, GSSX_RELEASE_HANDLE = 10, GSSX_GET_MIC = 11, GSSX_VERIFY = 12, GSSX_WRAP = 13, GSSX_UNWRAP = 14, GSSX_WRAP_SIZE_LIMIT = 15, }; #define PROC(proc, name) \ [GSSX_##proc] = { \ .p_proc = GSSX_##proc, \ .p_encode = gssx_enc_##name, \ .p_decode = gssx_dec_##name, \ .p_arglen = GSSX_ARG_##name##_sz, \ .p_replen = GSSX_RES_##name##_sz, \ .p_statidx = GSSX_##proc, \ .p_name = #proc, \ } static const struct rpc_procinfo gssp_procedures[] = { PROC(INDICATE_MECHS, indicate_mechs), PROC(GET_CALL_CONTEXT, get_call_context), PROC(IMPORT_AND_CANON_NAME, import_and_canon_name), PROC(EXPORT_CRED, export_cred), PROC(IMPORT_CRED, import_cred), PROC(ACQUIRE_CRED, acquire_cred), PROC(STORE_CRED, store_cred), PROC(INIT_SEC_CONTEXT, init_sec_context), PROC(ACCEPT_SEC_CONTEXT, accept_sec_context), PROC(RELEASE_HANDLE, release_handle), PROC(GET_MIC, get_mic), PROC(VERIFY, verify), PROC(WRAP, wrap), PROC(UNWRAP, unwrap), PROC(WRAP_SIZE_LIMIT, wrap_size_limit), }; /* * Common transport functions */ static const struct rpc_program gssp_program; static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) { static const struct sockaddr_un gssp_localaddr = { .sun_family = AF_LOCAL, .sun_path = GSSPROXY_SOCK_PATHNAME, }; struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_LOCAL, .address = (struct sockaddr *)&gssp_localaddr, .addrsize = sizeof(gssp_localaddr), .servername = "localhost", .program = &gssp_program, .version = GSSPROXY_VERS_1, .authflavor = RPC_AUTH_NULL, /* * Note we want connection to be done in the caller's * filesystem namespace. We therefore turn off the idle * timeout, which would result in reconnections being * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; int result = 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("RPC: failed to create AF_LOCAL gssproxy " "client (errno %ld).\n", PTR_ERR(clnt)); result = PTR_ERR(clnt); *_clnt = NULL; goto out; } dprintk("RPC: created new gssp local client (gssp_local_clnt: " "%p)\n", clnt); *_clnt = clnt; out: return result; } void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; } int set_gssp_clnt(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int ret; mutex_lock(&sn->gssp_lock); ret = gssp_rpc_create(net, &clnt); if (!ret) { if (sn->gssp_clnt) rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); return ret; } void clear_gssp_clnt(struct sunrpc_net *sn) { mutex_lock(&sn->gssp_lock); if (sn->gssp_clnt) { rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = NULL; } mutex_unlock(&sn->gssp_lock); } static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) { struct rpc_clnt *clnt; mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } static int gssp_call(struct net *net, struct rpc_message *msg) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int status; clnt = get_gssp_clnt(sn); if (!clnt) return -EIO; status = rpc_call_sync(clnt, msg, 0); if (status < 0) { dprintk("gssp: rpc_call returned error %d\n", -status); switch (status) { case -EPROTONOSUPPORT: status = -EINVAL; break; case -ECONNREFUSED: case -ETIMEDOUT: case -ENOTCONN: status = -EAGAIN; break; case -ERESTARTSYS: if (signalled ()) status = -EINTR; break; default: break; } } rpc_release_client(clnt); return status; } static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); kfree(arg->pages); } static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); if (!arg->pages) return -ENOMEM; for (i = 0; i < arg->npages; i++) { arg->pages[i] = alloc_page(GFP_KERNEL); if (!arg->pages[i]) { gssp_free_receive_pages(arg); return -ENOMEM; } } return 0; } static char *gssp_stringify(struct xdr_netobj *netobj) { return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL); } static void gssp_hostbased_service(char **principal) { char *c; if (!*principal) return; /* terminate and remove realm part */ c = strchr(*principal, '@'); if (c) { *c = '\0'; /* change service-hostname delimiter */ c = strchr(*principal, '/'); if (c) *c = '@'; } if (!c) { /* not a service principal */ kfree(*principal); *principal = NULL; } } /* * Public functions */ /* numbers somewhat arbitrary but large enough for current needs */ #define GSSX_MAX_OUT_HANDLE 128 #define GSSX_MAX_SRC_PRINC 256 #define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \ GSSX_max_oid_sz + \ GSSX_max_princ_sz + \ sizeof(struct svc_cred)) int gssp_accept_sec_context_upcall(struct net *net, struct gssp_upcall_data *data) { struct gssx_ctx ctxh = { .state = data->in_handle }; struct gssx_arg_accept_sec_context arg = { .input_token = data->in_token, }; struct gssx_ctx rctxh = { /* * pass in the max length we expect for each of these * buffers but let the xdr code kmalloc them: */ .exported_context_token.len = GSSX_max_output_handle_sz, .mech.len = GSS_OID_MAX_LEN, .targ_name.display_name.len = GSSX_max_princ_sz, .src_name.display_name.len = GSSX_max_princ_sz }; struct gssx_res_accept_sec_context res = { .context_handle = &rctxh, .output_token = &data->out_token }; struct rpc_message msg = { .rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT], .rpc_argp = &arg, .rpc_resp = &res, .rpc_cred = NULL, /* FIXME ? */ }; struct xdr_netobj client_name = { 0 , NULL }; struct xdr_netobj target_name = { 0, NULL }; int ret; if (data->in_handle.len != 0) arg.context_handle = &ctxh; res.output_token->len = GSSX_max_output_token_sz; ret = gssp_alloc_receive_pages(&arg); if (ret) return ret; ret = gssp_call(net, &msg); gssp_free_receive_pages(&arg); /* we need to fetch all data even in case of error so * that we can free special strctures is they have been allocated */ data->major_status = res.status.major_status; data->minor_status = res.status.minor_status; if (res.context_handle) { data->out_handle = rctxh.exported_context_token; data->mech_oid.len = rctxh.mech.len; if (rctxh.mech.data) { memcpy(data->mech_oid.data, rctxh.mech.data, data->mech_oid.len); kfree(rctxh.mech.data); } client_name = rctxh.src_name.display_name; target_name = rctxh.targ_name.display_name; } if (res.options.count == 1) { gssx_buffer *value = &res.options.data[0].value; /* Currently we only decode CREDS_VALUE, if we add * anything else we'll have to loop and match on the * option name */ if (value->len == 1) { /* steal group info from struct svc_cred */ data->creds = *(struct svc_cred *)value->data; data->found_creds = 1; } /* whether we use it or not, free data */ kfree(value->data); } if (res.options.count != 0) { kfree(res.options.data); } /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ if (data->found_creds) { if (client_name.data) { data->creds.cr_raw_principal = gssp_stringify(&client_name); data->creds.cr_principal = gssp_stringify(&client_name); gssp_hostbased_service(&data->creds.cr_principal); } if (target_name.data) { data->creds.cr_targ_princ = gssp_stringify(&target_name); gssp_hostbased_service(&data->creds.cr_targ_princ); } } kfree(client_name.data); kfree(target_name.data); return ret; } void gssp_free_upcall_data(struct gssp_upcall_data *data) { kfree(data->in_handle.data); kfree(data->out_handle.data); kfree(data->out_token.data); free_svc_cred(&data->creds); } /* * Initialization stuff */ static unsigned int gssp_version1_counts[ARRAY_SIZE(gssp_procedures)]; static const struct rpc_version gssp_version1 = { .number = GSSPROXY_VERS_1, .nrprocs = ARRAY_SIZE(gssp_procedures), .procs = gssp_procedures, .counts = gssp_version1_counts, }; static const struct rpc_version *gssp_version[] = { NULL, &gssp_version1, }; static struct rpc_stat gssp_stats; static const struct rpc_program gssp_program = { .name = "gssproxy", .number = GSSPROXY_PROGRAM, .nrvers = ARRAY_SIZE(gssp_version), .version = gssp_version, .stats = &gssp_stats, };
193 6 191 6 2 2 2 2 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* gw.c - CAN frame Gateway/Router/Bridge with netlink interface * * Copyright (c) 2019 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct canfd_frame and; struct canfd_frame or; struct canfd_frame xor; struct canfd_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct canfd_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. */ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct canfd_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len) MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len) MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len) MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_len, cf->len = mod->modframe.set.len) MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i); } static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i); } static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod) { int i; for (i = 0; i < CANFD_MAX_DLEN; i += 8) *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i); } static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod) { memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN); } /* retrieve valid CC DLC value and store it into 'len' */ static void mod_retrieve_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* len8_dlc is only valid if len == CAN_MAX_DLEN */ if (ccf->len != CAN_MAX_DLEN) return; /* do we have a valid len8_dlc value from 9 .. 15 ? */ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC) ccf->len = ccf->len8_dlc; } /* convert valid CC DLC value in 'len' into struct can_frame elements */ static void mod_store_ccdlc(struct canfd_frame *cf) { struct can_frame *ccf = (struct can_frame *)cf; /* clear potential leftovers */ ccf->len8_dlc = 0; /* plain data length 0 .. 8 - that was easy */ if (ccf->len <= CAN_MAX_DLEN) return; /* potentially broken values are caught in can_can_gw_rcv() */ if (ccf->len > CAN_MAX_RAW_DLC) return; /* we have a valid dlc value from 9 .. 15 in ccf->len */ ccf->len8_dlc = ccf->len; ccf->len = CAN_MAX_DLEN; } static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_and_len(cf, mod); mod_store_ccdlc(cf); } static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_or_len(cf, mod); mod_store_ccdlc(cf); } static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_retrieve_ccdlc(cf); mod_xor_len(cf, mod); mod_store_ccdlc(cf); } static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod) { mod_set_len(cf, mod); mod_store_ccdlc(cf); } static void canframecpy(struct canfd_frame *dst, struct can_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->len = src->len; *(u64 *)dst->data = *(u64 *)src->data; } static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src) { /* Copy the struct members separately to ensure that no uninitialized * data are copied in the 2 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->flags = src->flags; dst->len = src->len; memcpy(dst->data, src->data, CANFD_MAX_DLEN); } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r) { s8 dlen = CAN_MAX_DLEN; if (r->flags & CGW_FLAGS_CAN_FD) dlen = CANFD_MAX_DLEN; /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr >= -dlen && fr < dlen && to >= -dlen && to < dlen && re >= -dlen && re < dlen) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_len) { if (idx < 0) return rx_len + idx; else return idx; } static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->len); int to = calc_idx(xor->to_idx, cf->len); int res = calc_idx(xor->result_idx, cf->len); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->len); int to = calc_idx(crc8->to_idx, cf->len); int res = calc_idx(crc8->result_idx, cf->len); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct canfd_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc ^ cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc ^ crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { if (!can_is_canfd_skb(skb)) return; } else { if (!can_is_can_skb(skb)) return; } /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * * The Controller Area Network controllers only accept CAN frames with * correct CRCs - which are not visible in the controller registers. * According to skbuff.h documentation the csum_start element for IP * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. * Only CAN skbs can be processed here which already have this property. */ #define cgw_hops(skb) ((skb)->csum_start) BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); if (cgw_hops(skb) >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ if (gwj->mod.modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && cgw_hops(nskb) == 1) cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); /* Has the CAN frame been modified? */ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct canfd_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->len > max_len) { /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } /* check for checksum updates */ if (gwj->mod.csumfunc.crc8) (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); if (gwj->mod.csumfunc.xor) (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); kmem_cache_free(cgw_cache, gwj); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } if (gwj->mod.uid) { if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) goto cancel; } if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) goto cancel; } if (gwj->mod.csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &gwj->mod.csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX + 1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) }, [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX + 1]; struct rtcanmsg *r = nlmsg_data(nlh); int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (r->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; if (tb[CGW_FDMOD_AND]) { nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_and_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_and_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_fddata; } if (tb[CGW_FDMOD_OR]) { nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_or_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_or_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_fddata; } if (tb[CGW_FDMOD_XOR]) { nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_xor_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_xor_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_fddata; } if (tb[CGW_FDMOD_SET]) { nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN); canfdframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_LEN) mod->modfunc[modidx++] = mod_set_len; if (mb.modtype & CGW_MOD_FLAGS) mod->modfunc[modidx++] = mod_set_flags; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_fddata; } } else { struct cgw_frame_mod mb; if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_ccdlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx, r); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; if (mod.uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { if (gwj->mod.uid != mod.uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) return -EINVAL; /* update modifications with disabled softirq & quit */ local_bh_disable(); memcpy(&gwj->mod, &mod, sizeof(mod)); local_bh_enable(); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) return -ENODEV; gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; gwj->handled_frames = 0; gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ memcpy(&gwj->mod, &mod, sizeof(mod)); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; /* is sending the skb back to the incoming interface intended? */ if (gwj->src.dev == gwj->dst.dev && !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) { err = -EINVAL; goto out; } ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) kmem_cache_free(cgw_cache, gwj); return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; /* we have a match when uid is enabled and identical */ if (gwj->mod.uid || mod.uid) { if (gwj->mod.uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); call_rcu(&gwj->rcu, cgw_job_free_rcu); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit_batch = cangw_pernet_exit_batch, }; static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE, .doit = cgw_create_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE, .doit = cgw_remove_job}, {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE, .dumpit = cgw_dump_jobs}, }; static __init int cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway - max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if (!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(&notifier); if (ret) goto out_register_notifier; ret = rtnl_register_many(cgw_rtnl_msg_handlers); if (ret) goto out_rtnl_register; return 0; out_rtnl_register: unregister_netdevice_notifier(&notifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(&notifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit);
22 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mmap #if !defined(_TRACE_MMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_H #include <linux/tracepoint.h> TRACE_EVENT(vm_unmapped_area, TP_PROTO(unsigned long addr, struct vm_unmapped_area_info *info), TP_ARGS(addr, info), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, total_vm) __field(unsigned long, flags) __field(unsigned long, length) __field(unsigned long, low_limit) __field(unsigned long, high_limit) __field(unsigned long, align_mask) __field(unsigned long, align_offset) ), TP_fast_assign( __entry->addr = addr; __entry->total_vm = current->mm->total_vm; __entry->flags = info->flags; __entry->length = info->length; __entry->low_limit = info->low_limit; __entry->high_limit = info->high_limit; __entry->align_mask = info->align_mask; __entry->align_offset = info->align_offset; ), TP_printk("addr=0x%lx err=%ld total_vm=0x%lx flags=0x%lx len=0x%lx lo=0x%lx hi=0x%lx mask=0x%lx ofs=0x%lx", IS_ERR_VALUE(__entry->addr) ? 0 : __entry->addr, IS_ERR_VALUE(__entry->addr) ? __entry->addr : 0, __entry->total_vm, __entry->flags, __entry->length, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); TRACE_EVENT(vma_mas_szero, TP_PROTO(struct maple_tree *mt, unsigned long start, unsigned long end), TP_ARGS(mt, start, end), TP_STRUCT__entry( __field(struct maple_tree *, mt) __field(unsigned long, start) __field(unsigned long, end) ), TP_fast_assign( __entry->mt = mt; __entry->start = start; __entry->end = end; ), TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", __entry->mt, (unsigned long) __entry->start, (unsigned long) __entry->end ) ); TRACE_EVENT(vma_store, TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma), TP_ARGS(mt, vma), TP_STRUCT__entry( __field(struct maple_tree *, mt) __field(struct vm_area_struct *, vma) __field(unsigned long, vm_start) __field(unsigned long, vm_end) ), TP_fast_assign( __entry->mt = mt; __entry->vma = vma; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end - 1; ), TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", __entry->mt, __entry->vma, (unsigned long) __entry->vm_start, (unsigned long) __entry->vm_end ) ); TRACE_EVENT(exit_mmap, TP_PROTO(struct mm_struct *mm), TP_ARGS(mm), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(struct maple_tree *, mt) ), TP_fast_assign( __entry->mm = mm; __entry->mt = &mm->mm_mt; ), TP_printk("mt_mod %p, DESTROY", __entry->mt ) ); #endif /* This part must be outside protection */ #include <trace/define_trace.h>
14 14 14 14 14 2 14 14 19 19 19 19 19 19 6 13 12 6 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/switchdev.h> #include "br_private.h" static struct static_key_false br_switchdev_tx_fwd_offload; static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, const struct sk_buff *skb) { if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; return (p->flags & BR_TX_FWD_OFFLOAD) && (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; } void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) { skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); } /* Mark the frame for TX forwarding offload if this egress port supports it */ void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, struct sk_buff *skb) { if (nbp_switchdev_can_offload_tx_fwd(p, skb)) BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; } /* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms * that the skb has been already forwarded to, to avoid further cloning to * other ports in the same hwdom by making nbp_switchdev_allowed_egress() * return false. */ void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, struct sk_buff *skb) { if (nbp_switchdev_can_offload_tx_fwd(p, skb)) set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); } void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { if (p->hwdom) BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom; } bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); return !test_bit(p->hwdom, &cb->fwd_hwdoms) && (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ #define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \ BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \ BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST) int br_switchdev_set_port_flag(struct net_bridge_port *p, unsigned long flags, unsigned long mask, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->dev, }; struct switchdev_notifier_port_attr_info info = { .attr = &attr, }; int err; mask &= BR_PORT_FLAGS_HW_OFFLOAD; if (!mask) return 0; attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS; attr.u.brport_flags.val = flags; attr.u.brport_flags.mask = mask; /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, &info.info, extack); err = notifier_to_errno(err); if (err == -EOPNOTSUPP) return 0; if (err) { NL_SET_ERR_MSG_WEAK_MOD(extack, "bridge flag offload is not supported"); return -EOPNOTSUPP; } attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS; attr.flags = SWITCHDEV_F_DEFER; err = switchdev_port_attr_set(p->dev, &attr, extack); if (err) { NL_SET_ERR_MSG_WEAK_MOD(extack, "error setting offload flag on port"); return err; } return 0; } static void br_switchdev_fdb_populate(struct net_bridge *br, struct switchdev_notifier_fdb_info *item, const struct net_bridge_fdb_entry *fdb, const void *ctx) { const struct net_bridge_port *p = READ_ONCE(fdb->dst); item->addr = fdb->key.addr.addr; item->vid = fdb->key.vlan_id; item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); item->locked = false; item->info.dev = (!p || item->is_local) ? br->dev : p->dev; item->info.ctx = ctx; } void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { struct switchdev_notifier_fdb_info item; if (test_bit(BR_FDB_LOCKED, &fdb->flags)) return; /* Entries with these flags were created using ndm_state == NUD_REACHABLE, * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something * equivalent to 'bridge fdb add ... master dynamic (sticky)'. * Drivers don't know how to deal with these, so don't notify them to * avoid confusing them. */ if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) && !test_bit(BR_FDB_STATIC, &fdb->flags) && !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) return; br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, item.info.dev, &item.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, item.info.dev, &item.info, NULL); break; } } int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, bool changed, struct netlink_ext_ack *extack) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = flags, .vid = vid, .changed = changed, }; return switchdev_port_obj_add(dev, &v.obj, extack); } int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) { struct switchdev_obj_port_vlan v = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, }; return switchdev_port_obj_del(dev, &v.obj); } static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining) { struct net_bridge *br = joining->br; struct net_bridge_port *p; int hwdom; /* joining is yet to be added to the port list. */ list_for_each_entry(p, &br->port_list, list) { if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) { joining->hwdom = p->hwdom; return 0; } } hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1); if (hwdom >= BR_HWDOM_MAX) return -EBUSY; set_bit(hwdom, &br->busy_hwdoms); joining->hwdom = hwdom; return 0; } static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) { struct net_bridge *br = leaving->br; struct net_bridge_port *p; /* leaving is no longer in the port list. */ list_for_each_entry(p, &br->port_list, list) { if (p->hwdom == leaving->hwdom) return; } clear_bit(leaving->hwdom, &br->busy_hwdoms); } static int nbp_switchdev_add(struct net_bridge_port *p, struct netdev_phys_item_id ppid, bool tx_fwd_offload, struct netlink_ext_ack *extack) { int err; if (p->offload_count) { /* Prevent unsupported configurations such as a bridge port * which is a bonding interface, and the member ports are from * different hardware switches. */ if (!netdev_phys_item_id_same(&p->ppid, &ppid)) { NL_SET_ERR_MSG_MOD(extack, "Same bridge port cannot be offloaded by two physical switches"); return -EBUSY; } /* Tolerate drivers that call switchdev_bridge_port_offload() * more than once for the same bridge port, such as when the * bridge port is an offloaded bonding/team interface. */ p->offload_count++; return 0; } p->ppid = ppid; p->offload_count = 1; err = nbp_switchdev_hwdom_set(p); if (err) return err; if (tx_fwd_offload) { p->flags |= BR_TX_FWD_OFFLOAD; static_branch_inc(&br_switchdev_tx_fwd_offload); } return 0; } static void nbp_switchdev_del(struct net_bridge_port *p) { if (WARN_ON(!p->offload_count)) return; p->offload_count--; if (p->offload_count) return; if (p->hwdom) nbp_switchdev_hwdom_put(p); if (p->flags & BR_TX_FWD_OFFLOAD) { p->flags &= ~BR_TX_FWD_OFFLOAD; static_branch_dec(&br_switchdev_tx_fwd_offload); } } static int br_switchdev_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, const struct net_bridge_fdb_entry *fdb, unsigned long action, const void *ctx) { struct switchdev_notifier_fdb_info item; int err; br_switchdev_fdb_populate(br, &item, fdb, ctx); err = nb->notifier_call(nb, action, &item); return notifier_to_errno(err); } static int br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb) { struct net_bridge_fdb_entry *fdb; struct net_bridge *br; unsigned long action; int err = 0; if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; br = netdev_priv(br_dev); if (adding) action = SWITCHDEV_FDB_ADD_TO_DEVICE; else action = SWITCHDEV_FDB_DEL_TO_DEVICE; rcu_read_lock(); hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { err = br_switchdev_fdb_replay_one(br, nb, fdb, action, ctx); if (err) break; } rcu_read_unlock(); return err; } static int br_switchdev_vlan_attr_replay(struct net_device *br_dev, const void *ctx, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_attr_info attr_info = { .info = { .dev = br_dev, .extack = extack, .ctx = ctx, }, }; struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_vlan_group *vg; struct switchdev_attr attr; struct net_bridge_vlan *v; int err; attr_info.attr = &attr; attr.orig_dev = br_dev; vg = br_vlan_group(br); if (!vg) return 0; list_for_each_entry(v, &vg->vlan_list, vlist) { if (v->msti) { attr.id = SWITCHDEV_ATTR_ID_VLAN_MSTI; attr.u.vlan_msti.vid = v->vid; attr.u.vlan_msti.msti = v->msti; err = nb->notifier_call(nb, SWITCHDEV_PORT_ATTR_SET, &attr_info); err = notifier_to_errno(err); if (err) return err; } } return 0; } static int br_switchdev_vlan_replay_one(struct notifier_block *nb, struct net_device *dev, struct switchdev_obj_port_vlan *vlan, const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &vlan->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_switchdev_vlan_replay_group(struct notifier_block *nb, struct net_device *dev, struct net_bridge_vlan_group *vg, const void *ctx, unsigned long action, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v; int err = 0; u16 pvid; if (!vg) return 0; pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct switchdev_obj_port_vlan vlan = { .obj.orig_dev = dev, .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .flags = br_vlan_flags(v, pvid), .vid = v->vid, }; if (!br_vlan_should_use(v)) continue; err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx, action, extack); if (err) return err; } return 0; } static int br_switchdev_vlan_replay(struct net_device *br_dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(br_dev); struct net_bridge_port *p; unsigned long action; int err; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev)) return -EINVAL; if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br), ctx, action, extack); if (err) return err; list_for_each_entry(p, &br->port_list, list) { struct net_device *dev = p->dev; err = br_switchdev_vlan_replay_group(nb, dev, nbp_vlan_group(p), ctx, action, extack); if (err) return err; } if (adding) { err = br_switchdev_vlan_attr_replay(br_dev, ctx, nb, extack); if (err) return err; } return 0; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct br_switchdev_mdb_complete_info { struct net_bridge_port *port; struct br_ip ip; }; static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *priv) { struct br_switchdev_mdb_complete_info *data = priv; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; if (err) goto err; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); if (!mp) goto out; for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port != port) continue; p->flags |= MDB_PG_FLAGS_OFFLOAD; } out: spin_unlock_bh(&br->multicast_lock); err: kfree(priv); } static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, const struct net_bridge_mdb_entry *mp) { if (mp->addr.proto == htons(ETH_P_IP)) ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); #if IS_ENABLED(CONFIG_IPV6) else if (mp->addr.proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); #endif else ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); mdb->vid = mp->addr.vid; } static void br_switchdev_host_mdb_one(struct net_device *dev, struct net_device *lower_dev, struct net_bridge_mdb_entry *mp, int type) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, .orig_dev = dev, }, }; br_switchdev_mdb_populate(&mdb, mp); switch (type) { case RTM_NEWMDB: switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); break; case RTM_DELMDB: switchdev_port_obj_del(lower_dev, &mdb.obj); break; } } static void br_switchdev_host_mdb(struct net_device *dev, struct net_bridge_mdb_entry *mp, int type) { struct net_device *lower_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, lower_dev, iter) br_switchdev_host_mdb_one(dev, lower_dev, mp, type); } static int br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, const struct switchdev_obj_port_mdb *mdb, unsigned long action, const void *ctx, struct netlink_ext_ack *extack) { struct switchdev_notifier_port_obj_info obj_info = { .info = { .dev = dev, .extack = extack, .ctx = ctx, }, .obj = &mdb->obj, }; int err; err = nb->notifier_call(nb, action, &obj_info); return notifier_to_errno(err); } static int br_switchdev_mdb_queue_one(struct list_head *mdb_list, struct net_device *dev, unsigned long action, enum switchdev_obj_id id, const struct net_bridge_mdb_entry *mp, struct net_device *orig_dev) { struct switchdev_obj_port_mdb mdb = { .obj = { .id = id, .orig_dev = orig_dev, }, }; struct switchdev_obj_port_mdb *pmdb; br_switchdev_mdb_populate(&mdb, mp); if (action == SWITCHDEV_PORT_OBJ_ADD && switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) { /* This event is already in the deferred queue of * events, so this replay must be elided, lest the * driver receives duplicate events for it. This can * only happen when replaying additions, since * modifications are always immediately visible in * br->mdb_list, whereas actual event delivery may be * delayed. */ return 0; } pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC); if (!pmdb) return -ENOMEM; list_add_tail(&pmdb->obj.list, mdb_list); return 0; } void br_switchdev_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { struct br_switchdev_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, }; if (!pg) return br_switchdev_host_mdb(dev, mp, type); br_switchdev_mdb_populate(&mdb, mp); mdb.obj.orig_dev = pg->key.port->dev; switch (type) { case RTM_NEWMDB: complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); if (!complete_info) break; complete_info->port = pg->key.port; complete_info->ip = mp->addr; mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_switchdev_mdb_complete; if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) kfree(complete_info); break; case RTM_DELMDB: switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); break; } } #endif static int br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, const void *ctx, bool adding, struct notifier_block *nb, struct netlink_ext_ack *extack) { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING const struct net_bridge_mdb_entry *mp; struct switchdev_obj *obj, *tmp; struct net_bridge *br; unsigned long action; LIST_HEAD(mdb_list); int err = 0; ASSERT_RTNL(); if (!nb) return 0; if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) return -EINVAL; br = netdev_priv(br_dev); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) return 0; if (adding) action = SWITCHDEV_PORT_OBJ_ADD; else action = SWITCHDEV_PORT_OBJ_DEL; /* br_switchdev_mdb_queue_one() will take care to not queue a * replay of an event that is already pending in the switchdev * deferred queue. In order to safely determine that, there * must be no new deferred MDB notifications enqueued for the * duration of the MDB scan. Therefore, grab the write-side * lock to avoid racing with any concurrent IGMP/MLD snooping. */ spin_lock_bh(&br->multicast_lock); hlist_for_each_entry(mp, &br->mdb_list, mdb_node) { struct net_bridge_port_group __rcu * const *pp; const struct net_bridge_port_group *p; if (mp->host_joined) { err = br_switchdev_mdb_queue_one(&mdb_list, dev, action, SWITCHDEV_OBJ_ID_HOST_MDB, mp, br_dev); if (err) { spin_unlock_bh(&br->multicast_lock); goto out_free_mdb; } } for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; pp = &p->next) { if (p->key.port->dev != dev) continue; err = br_switchdev_mdb_queue_one(&mdb_list, dev, action, SWITCHDEV_OBJ_ID_PORT_MDB, mp, dev); if (err) { spin_unlock_bh(&br->multicast_lock); goto out_free_mdb; } } } spin_unlock_bh(&br->multicast_lock); list_for_each_entry(obj, &mdb_list, list) { err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); if (err == -EOPNOTSUPP) err = 0; if (err) goto out_free_mdb; } out_free_mdb: list_for_each_entry_safe(obj, tmp, &mdb_list, list) { list_del(&obj->list); kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); } if (err) return err; #endif return 0; } static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; int err; err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack); if (err && err != -EOPNOTSUPP) return err; err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); if (err) { /* -EOPNOTSUPP not propagated from MDB replay. */ return err; } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) return err; return 0; } static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { struct net_device *br_dev = p->br->dev; struct net_device *dev = p->dev; br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL); /* Make sure that the device leaving this bridge has seen all * relevant events before it is disassociated. In the normal * case, when the device is directly attached to the bridge, * this is covered by del_nbp(). If the association was indirect * however, e.g. via a team or bond, and the device is leaving * that intermediate device, then the bridge port remains in * place. */ switchdev_deferred_process(); } /* Let the bridge know that this port is offloaded, so that it can assign a * switchdev hardware domain to it. */ int br_switchdev_port_offload(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, bool tx_fwd_offload, struct netlink_ext_ack *extack) { struct netdev_phys_item_id ppid; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) return err; err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack); if (err) return err; err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); if (err) goto out_switchdev_del; return 0; out_switchdev_del: nbp_switchdev_del(p); return err; } void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb) { nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb); nbp_switchdev_del(p); } int br_switchdev_port_replay(struct net_bridge_port *p, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, struct netlink_ext_ack *extack) { return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); }
28 9 8 25 6 1 5 4 6 6 6 6 24 24 28 26 17 9 25 21 9 8 7 27 9 35 12 7 7 3 12 19 6 5 4 3 19 8 11 10 2 2 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 // SPDX-License-Identifier: GPL-2.0 #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/utime.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <asm/unistd.h> #include <linux/filelock.h> static bool nsec_valid(long nsec) { if (nsec == UTIME_OMIT || nsec == UTIME_NOW) return true; return nsec >= 0 && nsec <= 999999999; } int vfs_utimes(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; if (times) { if (!nsec_valid(times[0].tv_nsec) || !nsec_valid(times[1].tv_nsec)) return -EINVAL; if (times[0].tv_nsec == UTIME_NOW && times[1].tv_nsec == UTIME_NOW) times = NULL; } error = mnt_want_write(path->mnt); if (error) goto out; newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; if (times) { if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { newattrs.ia_atime = times[0]; newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { newattrs.ia_mtime = times[1]; newattrs.ia_valid |= ATTR_MTIME_SET; } /* * Tell setattr_prepare(), that this is an explicit time * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET * were used. */ newattrs.ia_valid |= ATTR_TIMES_SET; } else { newattrs.ia_valid |= ATTR_TOUCH; } retry_deleg: inode_lock(inode); error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path->mnt); out: return error; } static int do_utimes_path(int dfd, const char __user *filename, struct timespec64 *times, int flags) { struct path path; int lookup_flags = 0, error; if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) return -EINVAL; if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) return error; error = vfs_utimes(&path, times); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } static int do_utimes_fd(int fd, struct timespec64 *times, int flags) { if (flags) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; return vfs_utimes(&fd_file(f)->f_path, times); } /* * do_utimes - change times on filename or file descriptor * @dfd: open file descriptor, -1 or AT_FDCWD * @filename: path name or NULL * @times: new times or NULL * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment) * * If filename is NULL and dfd refers to an open file, then operate on * the file. Otherwise look up filename, possibly using dfd as a * starting point. * * If times==NULL, set access and modification to current time, * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { if (filename == NULL && dfd != AT_FDCWD) return do_utimes_fd(dfd, times, flags); return do_utimes_path(dfd, filename, times, flags); } SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct __kernel_timespec __user *, utimes, int, flags) { struct timespec64 tstimes[2]; if (utimes) { if ((get_timespec64(&tstimes[0], &utimes[0]) || get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. */ if (tstimes[0].tv_nsec == UTIME_OMIT && tstimes[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME /* * futimesat(), utimes() and utime() are older versions of utimensat() * that are provided for compatibility with traditional C libraries. * On modern architectures, we always use libc wrappers around * utimensat() instead. */ static long do_futimesat(int dfd, const char __user *filename, struct __kernel_old_timeval __user *utimes) { struct __kernel_old_timeval times[2]; struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(&times, utimes, sizeof(times))) return -EFAULT; /* This test is needed to catch all invalid values. If we would test only in do_utimes we would miss those invalid values truncated by the multiplication with 1000. Note that we also catch UTIME_{NOW,OMIT} here which are only valid for utimensat. */ if (times[0].tv_usec >= 1000000 || times[0].tv_usec < 0 || times[1].tv_usec >= 1000000 || times[1].tv_usec < 0) return -EINVAL; tstimes[0].tv_sec = times[0].tv_sec; tstimes[0].tv_nsec = 1000 * times[0].tv_usec; tstimes[1].tv_sec = times[1].tv_sec; tstimes[1].tv_nsec = 1000 * times[1].tv_usec; } return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(dfd, filename, utimes); } SYSCALL_DEFINE2(utimes, char __user *, filename, struct __kernel_old_timeval __user *, utimes) { return do_futimesat(AT_FDCWD, filename, utimes); } SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, &times->actime) || get_user(tv[1].tv_sec, &times->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, times ? tv : NULL, 0); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. */ #ifdef __ARCH_WANT_SYS_UTIME32 SYSCALL_DEFINE2(utime32, const char __user *, filename, struct old_utimbuf32 __user *, t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || get_user(tv[1].tv_sec, &t->modtime)) return -EFAULT; tv[0].tv_nsec = 0; tv[1].tv_nsec = 0; } return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); } #endif SYSCALL_DEFINE4(utimensat_time32, unsigned int, dfd, const char __user *, filename, struct old_timespec32 __user *, t, int, flags) { struct timespec64 tv[2]; if (t) { if (get_old_timespec32(&tv[0], &t[0]) || get_old_timespec32(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) return 0; } return do_utimes(dfd, filename, t ? tv : NULL, flags); } #ifdef __ARCH_WANT_SYS_UTIME32 static long do_compat_futimesat(unsigned int dfd, const char __user *filename, struct old_timeval32 __user *t) { struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_nsec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_nsec, &t[1].tv_usec)) return -EFAULT; if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) return -EINVAL; tv[0].tv_nsec *= 1000; tv[1].tv_nsec *= 1000; } return do_utimes(dfd, filename, t ? tv : NULL, 0); } SYSCALL_DEFINE3(futimesat_time32, unsigned int, dfd, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(dfd, filename, t); } SYSCALL_DEFINE2(utimes_time32, const char __user *, filename, struct old_timeval32 __user *, t) { return do_compat_futimesat(AT_FDCWD, filename, t); } #endif #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 // SPDX-License-Identifier: GPL-2.0-or-later /* * mount.c - operations for initializing and mounting configfs. * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/configfs.h> #include "configfs_internal.h" /* Random magic number */ #define CONFIGFS_MAGIC 0x62656570 static struct vfsmount *configfs_mount = NULL; struct kmem_cache *configfs_dir_cachep; static int configfs_mnt_count = 0; static void configfs_free_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); free_inode_nonrcu(inode); } static const struct super_operations configfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .free_inode = configfs_free_inode, }; static struct config_group configfs_root_group = { .cg_item = { .ci_namebuf = "root", .ci_name = configfs_root_group.cg_item.ci_namebuf, }, }; int configfs_is_root(struct config_item *item) { return item == &configfs_root_group.cg_item; } static struct configfs_dirent configfs_root = { .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling), .s_children = LIST_HEAD_INIT(configfs_root.s_children), .s_element = &configfs_root_group.cg_item, .s_type = CONFIGFS_ROOT, .s_iattr = NULL, }; static int configfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dentry *root; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; sb->s_time_gran = 1; inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, &configfs_root, sb); if (inode) { inode->i_op = &configfs_root_inode_operations; inode->i_fop = &configfs_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); } else { pr_debug("could not get root inode\n"); return -ENOMEM; } root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__func__); return -ENOMEM; } config_group_init(&configfs_root_group); configfs_root_group.cg_item.ci_dentry = root; root->d_fsdata = &configfs_root; sb->s_root = root; sb->s_d_op = &configfs_dentry_ops; /* the rest get that */ return 0; } static int configfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, configfs_fill_super); } static const struct fs_context_operations configfs_context_ops = { .get_tree = configfs_get_tree, }; static int configfs_init_fs_context(struct fs_context *fc) { fc->ops = &configfs_context_ops; return 0; } static struct file_system_type configfs_fs_type = { .owner = THIS_MODULE, .name = "configfs", .init_fs_context = configfs_init_fs_context, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("configfs"); struct dentry *configfs_pin_fs(void) { int err = simple_pin_fs(&configfs_fs_type, &configfs_mount, &configfs_mnt_count); return err ? ERR_PTR(err) : configfs_mount->mnt_root; } void configfs_release_fs(void) { simple_release_fs(&configfs_mount, &configfs_mnt_count); } static int __init configfs_init(void) { int err = -ENOMEM; configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", sizeof(struct configfs_dirent), 0, 0, NULL); if (!configfs_dir_cachep) goto out; err = sysfs_create_mount_point(kernel_kobj, "config"); if (err) goto out2; err = register_filesystem(&configfs_fs_type); if (err) goto out3; return 0; out3: pr_err("Unable to register filesystem!\n"); sysfs_remove_mount_point(kernel_kobj, "config"); out2: kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; out: return err; } static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); sysfs_remove_mount_point(kernel_kobj, "config"); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); core_initcall(configfs_init); module_exit(configfs_exit);
6 2 434 124 2234 2969 692 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sock #if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SOCK_H #include <net/sock.h> #include <net/ipv6.h> #include <linux/tracepoint.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <trace/events/net_probe_common.h> #define family_names \ EM(AF_INET) \ EMe(AF_INET6) /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) #define tcp_state_names \ EM(TCP_ESTABLISHED) \ EM(TCP_SYN_SENT) \ EM(TCP_SYN_RECV) \ EM(TCP_FIN_WAIT1) \ EM(TCP_FIN_WAIT2) \ EM(TCP_TIME_WAIT) \ EM(TCP_CLOSE) \ EM(TCP_CLOSE_WAIT) \ EM(TCP_LAST_ACK) \ EM(TCP_LISTEN) \ EM(TCP_CLOSING) \ EMe(TCP_NEW_SYN_RECV) #define skmem_kind_names \ EM(SK_MEM_SEND) \ EMe(SK_MEM_RECV) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a) TRACE_DEFINE_ENUM(a); #define EMe(a) TRACE_DEFINE_ENUM(a); family_names inet_protocol_names tcp_state_names skmem_kind_names #undef EM #undef EMe #define EM(a) { a, #a }, #define EMe(a) { a, #a } #define show_family_name(val) \ __print_symbolic(val, family_names) #define show_inet_protocol_name(val) \ __print_symbolic(val, inet_protocol_names) #define show_tcp_state_name(val) \ __print_symbolic(val, tcp_state_names) #define show_skmem_kind_names(val) \ __print_symbolic(val, skmem_kind_names) TRACE_EVENT(sock_rcvqueue_full, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(int, rmem_alloc) __field(unsigned int, truesize) __field(int, sk_rcvbuf) ), TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf) ); TRACE_EVENT(sock_exceed_buf_limit, TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind), TP_ARGS(sk, prot, allocated, kind), TP_STRUCT__entry( __array(char, name, 32) __array(long, sysctl_mem, 3) __field(long, allocated) __field(int, sysctl_rmem) __field(int, rmem_alloc) __field(int, sysctl_wmem) __field(int, wmem_alloc) __field(int, wmem_queued) __field(int, kind) ), TP_fast_assign( strscpy(__entry->name, prot->name, 32); __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]); __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]); __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]); __entry->allocated = allocated; __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s", __entry->name, __entry->sysctl_mem[0], __entry->sysctl_mem[1], __entry->sysctl_mem[2], __entry->allocated, __entry->sysctl_rmem, __entry->rmem_alloc, __entry->sysctl_wmem, __entry->wmem_alloc, __entry->wmem_queued, show_skmem_kind_names(__entry->kind) ) ); TRACE_EVENT(inet_sock_set_state, TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), TP_ARGS(sk, oldstate, newstate), TP_STRUCT__entry( __field(const void *, skaddr) __field(int, oldstate) __field(int, newstate) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->oldstate = oldstate; __entry->newstate = newstate; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->oldstate), show_tcp_state_name(__entry->newstate)) ); TRACE_EVENT(inet_sk_error_report, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(int, error) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u16, protocol) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->error = sk->sk_err; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d", show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->error) ); TRACE_EVENT(sk_data_ready, TP_PROTO(const struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, family) __field(__u16, protocol) __field(unsigned long, ip) ), TP_fast_assign( __entry->skaddr = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ip = _RET_IP_; ), TP_printk("family=%u protocol=%u func=%ps", __entry->family, __entry->protocol, (void *)__entry->ip) ); /* * sock send/recv msg length */ DECLARE_EVENT_CLASS(sock_msg_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags), TP_STRUCT__entry( __field(void *, sk) __field(__u16, family) __field(__u16, protocol) __field(int, ret) __field(int, flags) ), TP_fast_assign( __entry->sk = sk; __entry->family = sk->sk_family; __entry->protocol = sk->sk_protocol; __entry->ret = ret; __entry->flags = flags; ), TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x", __entry->sk, show_family_name(__entry->family), show_inet_protocol_name(__entry->protocol), !(__entry->flags & MSG_PEEK) ? (__entry->ret > 0 ? __entry->ret : 0) : 0, __entry->ret < 0 ? __entry->ret : 0, __entry->flags) ); DEFINE_EVENT(sock_msg_length, sock_send_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); DEFINE_EVENT(sock_msg_length, sock_recv_length, TP_PROTO(struct sock *sk, int ret, int flags), TP_ARGS(sk, ret, flags) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
4 4 2 2 2 4 4 4 3 4 1 1 4 4 4 4 1 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @cdev: the cdev structure * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev);
58 50 50 58 49 50 53 58 39 34 11 23 11 20 39 38 843 827 179 657 658 41 5 40 41 40 33 33 33 32 32 58 58 57 58 58 58 58 57 58 58 58 58 58 67 67 67 19 19 15 15 15 15 5 65 65 67 66 21 37 37 37 37 36 36 36 36 36 36 36 36 36 36 36 38 38 38 38 1400 1390 1370 162 145 146 1391 1400 1396 451 453 453 1894 513 1883 1885 1884 514 1878 1673 1879 1894 1872 1867 1870 1866 58 41 41 11 3 13 4 9 7 41 58 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/util.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/slab.h> #include <linux/rculist.h> #include "common.h" /* Lock for protecting policy. */ DEFINE_MUTEX(tomoyo_policy_lock); /* Has /sbin/init started? */ bool tomoyo_policy_loaded; /* * Mapping table from "enum tomoyo_mac_index" to * "enum tomoyo_mac_category_index". */ const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = { /* CONFIG::file group */ [TOMOYO_MAC_FILE_EXECUTE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_OPEN] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CREATE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_UNLINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_GETATTR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKDIR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_RMDIR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKFIFO] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKSOCK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_TRUNCATE] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_SYMLINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKBLOCK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MKCHAR] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_LINK] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_RENAME] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHMOD] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHOWN] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHGRP] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_IOCTL] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_CHROOT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_MOUNT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_UMOUNT] = TOMOYO_MAC_CATEGORY_FILE, [TOMOYO_MAC_FILE_PIVOT_ROOT] = TOMOYO_MAC_CATEGORY_FILE, /* CONFIG::network group */ [TOMOYO_MAC_NETWORK_INET_STREAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_RAW_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_INET_RAW_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN] = TOMOYO_MAC_CATEGORY_NETWORK, [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = TOMOYO_MAC_CATEGORY_NETWORK, /* CONFIG::misc group */ [TOMOYO_MAC_ENVIRON] = TOMOYO_MAC_CATEGORY_MISC, }; /** * tomoyo_convert_time - Convert time_t to YYYY/MM/DD hh/mm/ss. * * @time64: Seconds since 1970/01/01 00:00:00. * @stamp: Pointer to "struct tomoyo_time". * * Returns nothing. */ void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp) { struct tm tm; time64_to_tm(time64, 0, &tm); stamp->sec = tm.tm_sec; stamp->min = tm.tm_min; stamp->hour = tm.tm_hour; stamp->day = tm.tm_mday; stamp->month = tm.tm_mon + 1; stamp->year = tm.tm_year + 1900; } /** * tomoyo_permstr - Find permission keywords. * * @string: String representation for permissions in foo/bar/buz format. * @keyword: Keyword to find from @string/ * * Returns true if @keyword was found in @string, false otherwise. * * This function assumes that strncmp(w1, w2, strlen(w1)) != 0 if w1 != w2. */ bool tomoyo_permstr(const char *string, const char *keyword) { const char *cp = strstr(string, keyword); if (cp) return cp == string || *(cp - 1) == '/'; return false; } /** * tomoyo_read_token - Read a word from a line. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns a word on success, "" otherwise. * * To allow the caller to skip NULL check, this function returns "" rather than * NULL if there is no more words to read. */ char *tomoyo_read_token(struct tomoyo_acl_param *param) { char *pos = param->data; char *del = strchr(pos, ' '); if (del) *del++ = '\0'; else del = pos + strlen(pos); param->data = del; return pos; } static bool tomoyo_correct_path2(const char *filename, const size_t len); /** * tomoyo_get_domainname - Read a domainname from a line. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns a domainname on success, NULL otherwise. */ const struct tomoyo_path_info *tomoyo_get_domainname (struct tomoyo_acl_param *param) { char *start = param->data; char *pos = start; while (*pos) { if (*pos++ != ' ' || tomoyo_correct_path2(pos, strchrnul(pos, ' ') - pos)) continue; *(pos - 1) = '\0'; break; } param->data = pos; if (tomoyo_correct_domain(start)) return tomoyo_get_name(start); return NULL; } /** * tomoyo_parse_ulong - Parse an "unsigned long" value. * * @result: Pointer to "unsigned long". * @str: Pointer to string to parse. * * Returns one of values in "enum tomoyo_value_type". * * The @src is updated to point the first character after the value * on success. */ u8 tomoyo_parse_ulong(unsigned long *result, char **str) { const char *cp = *str; char *ep; int base = 10; if (*cp == '0') { char c = *(cp + 1); if (c == 'x' || c == 'X') { base = 16; cp += 2; } else if (c >= '0' && c <= '7') { base = 8; cp++; } } *result = simple_strtoul(cp, &ep, base); if (cp == ep) return TOMOYO_VALUE_TYPE_INVALID; *str = ep; switch (base) { case 16: return TOMOYO_VALUE_TYPE_HEXADECIMAL; case 8: return TOMOYO_VALUE_TYPE_OCTAL; default: return TOMOYO_VALUE_TYPE_DECIMAL; } } /** * tomoyo_print_ulong - Print an "unsigned long" value. * * @buffer: Pointer to buffer. * @buffer_len: Size of @buffer. * @value: An "unsigned long" value. * @type: Type of @value. * * Returns nothing. */ void tomoyo_print_ulong(char *buffer, const int buffer_len, const unsigned long value, const u8 type) { if (type == TOMOYO_VALUE_TYPE_DECIMAL) snprintf(buffer, buffer_len, "%lu", value); else if (type == TOMOYO_VALUE_TYPE_OCTAL) snprintf(buffer, buffer_len, "0%lo", value); else if (type == TOMOYO_VALUE_TYPE_HEXADECIMAL) snprintf(buffer, buffer_len, "0x%lX", value); else snprintf(buffer, buffer_len, "type(%u)", type); } /** * tomoyo_parse_name_union - Parse a tomoyo_name_union. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_name_union(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename; if (param->data[0] == '@') { param->data++; ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP); return ptr->group != NULL; } filename = tomoyo_read_token(param); if (!tomoyo_correct_word(filename)) return false; ptr->filename = tomoyo_get_name(filename); return ptr->filename != NULL; } /** * tomoyo_parse_number_union - Parse a tomoyo_number_union. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_number_union(struct tomoyo_acl_param *param, struct tomoyo_number_union *ptr) { char *data; u8 type; unsigned long v; memset(ptr, 0, sizeof(*ptr)); if (param->data[0] == '@') { param->data++; ptr->group = tomoyo_get_group(param, TOMOYO_NUMBER_GROUP); return ptr->group != NULL; } data = tomoyo_read_token(param); type = tomoyo_parse_ulong(&v, &data); if (type == TOMOYO_VALUE_TYPE_INVALID) return false; ptr->values[0] = v; ptr->value_type[0] = type; if (!*data) { ptr->values[1] = v; ptr->value_type[1] = type; return true; } if (*data++ != '-') return false; type = tomoyo_parse_ulong(&v, &data); if (type == TOMOYO_VALUE_TYPE_INVALID || *data || ptr->values[0] > v) return false; ptr->values[1] = v; ptr->value_type[1] = type; return true; } /** * tomoyo_byte_range - Check whether the string is a \ooo style octal value. * * @str: Pointer to the string. * * Returns true if @str is a \ooo style octal value, false otherwise. * * TOMOYO uses \ooo style representation for 0x01 - 0x20 and 0x7F - 0xFF. * This function verifies that \ooo is in valid range. */ static inline bool tomoyo_byte_range(const char *str) { return *str >= '0' && *str++ <= '3' && *str >= '0' && *str++ <= '7' && *str >= '0' && *str <= '7'; } /** * tomoyo_alphabet_char - Check whether the character is an alphabet. * * @c: The character to check. * * Returns true if @c is an alphabet character, false otherwise. */ static inline bool tomoyo_alphabet_char(const char c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); } /** * tomoyo_make_byte - Make byte value from three octal characters. * * @c1: The first character. * @c2: The second character. * @c3: The third character. * * Returns byte value. */ static inline u8 tomoyo_make_byte(const u8 c1, const u8 c2, const u8 c3) { return ((c1 - '0') << 6) + ((c2 - '0') << 3) + (c3 - '0'); } /** * tomoyo_valid - Check whether the character is a valid char. * * @c: The character to check. * * Returns true if @c is a valid character, false otherwise. */ static inline bool tomoyo_valid(const unsigned char c) { return c > ' ' && c < 127; } /** * tomoyo_invalid - Check whether the character is an invalid char. * * @c: The character to check. * * Returns true if @c is an invalid character, false otherwise. */ static inline bool tomoyo_invalid(const unsigned char c) { return c && (c <= ' ' || c >= 127); } /** * tomoyo_str_starts - Check whether the given string starts with the given keyword. * * @src: Pointer to pointer to the string. * @find: Pointer to the keyword. * * Returns true if @src starts with @find, false otherwise. * * The @src is updated to point the first character after the @find * if @src starts with @find. */ bool tomoyo_str_starts(char **src, const char *find) { const int len = strlen(find); char *tmp = *src; if (strncmp(tmp, find, len)) return false; tmp += len; *src = tmp; return true; } /** * tomoyo_normalize_line - Format string. * * @buffer: The line to normalize. * * Leading and trailing whitespaces are removed. * Multiple whitespaces are packed into single space. * * Returns nothing. */ void tomoyo_normalize_line(unsigned char *buffer) { unsigned char *sp = buffer; unsigned char *dp = buffer; bool first = true; while (tomoyo_invalid(*sp)) sp++; while (*sp) { if (!first) *dp++ = ' '; first = false; while (tomoyo_valid(*sp)) *dp++ = *sp++; while (tomoyo_invalid(*sp)) sp++; } *dp = '\0'; } /** * tomoyo_correct_word2 - Validate a string. * * @string: The string to check. Maybe non-'\0'-terminated. * @len: Length of @string. * * Check whether the given string follows the naming rules. * Returns true if @string follows the naming rules, false otherwise. */ static bool tomoyo_correct_word2(const char *string, size_t len) { u8 recursion = 20; const char *const start = string; bool in_repetition = false; if (!len) goto out; while (len--) { unsigned char c = *string++; if (c == '\\') { if (!len--) goto out; c = *string++; if (c >= '0' && c <= '3') { unsigned char d; unsigned char e; if (!len-- || !len--) goto out; d = *string++; e = *string++; if (d < '0' || d > '7' || e < '0' || e > '7') goto out; c = tomoyo_make_byte(c, d, e); if (c <= ' ' || c >= 127) continue; goto out; } switch (c) { case '\\': /* "\\" */ case '+': /* "\+" */ case '?': /* "\?" */ case 'x': /* "\x" */ case 'a': /* "\a" */ case '-': /* "\-" */ continue; } if (!recursion--) goto out; switch (c) { case '*': /* "\*" */ case '@': /* "\@" */ case '$': /* "\$" */ case 'X': /* "\X" */ case 'A': /* "\A" */ continue; case '{': /* "/\{" */ if (string - 3 < start || *(string - 3) != '/') goto out; in_repetition = true; continue; case '}': /* "\}/" */ if (*string != '/') goto out; if (!in_repetition) goto out; in_repetition = false; continue; } goto out; } else if (in_repetition && c == '/') { goto out; } else if (c <= ' ' || c >= 127) { goto out; } } if (in_repetition) goto out; return true; out: return false; } /** * tomoyo_correct_word - Validate a string. * * @string: The string to check. * * Check whether the given string follows the naming rules. * Returns true if @string follows the naming rules, false otherwise. */ bool tomoyo_correct_word(const char *string) { return tomoyo_correct_word2(string, strlen(string)); } /** * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules. * * @filename: The pathname to check. * @len: Length of @filename. * * Returns true if @filename follows the naming rules, false otherwise. */ static bool tomoyo_correct_path2(const char *filename, const size_t len) { const char *cp1 = memchr(filename, '/', len); const char *cp2 = memchr(filename, '.', len); return cp1 && (!cp2 || (cp1 < cp2)) && tomoyo_correct_word2(filename, len); } /** * tomoyo_correct_path - Validate a pathname. * * @filename: The pathname to check. * * Check whether the given pathname follows the naming rules. * Returns true if @filename follows the naming rules, false otherwise. */ bool tomoyo_correct_path(const char *filename) { return tomoyo_correct_path2(filename, strlen(filename)); } /** * tomoyo_correct_domain - Check whether the given domainname follows the naming rules. * * @domainname: The domainname to check. * * Returns true if @domainname follows the naming rules, false otherwise. */ bool tomoyo_correct_domain(const unsigned char *domainname) { if (!domainname || !tomoyo_domain_def(domainname)) return false; domainname = strchr(domainname, ' '); if (!domainname++) return true; while (1) { const unsigned char *cp = strchr(domainname, ' '); if (!cp) break; if (!tomoyo_correct_path2(domainname, cp - domainname)) return false; domainname = cp + 1; } return tomoyo_correct_path(domainname); } /** * tomoyo_domain_def - Check whether the given token can be a domainname. * * @buffer: The token to check. * * Returns true if @buffer possibly be a domainname, false otherwise. */ bool tomoyo_domain_def(const unsigned char *buffer) { const unsigned char *cp; int len; if (*buffer != '<') return false; cp = strchr(buffer, ' '); if (!cp) len = strlen(buffer); else len = cp - buffer; if (buffer[len - 1] != '>' || !tomoyo_correct_word2(buffer + 1, len - 2)) return false; return true; } /** * tomoyo_find_domain - Find a domain by the given name. * * @domainname: The domainname to find. * * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise. * * Caller holds tomoyo_read_lock(). */ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname) { struct tomoyo_domain_info *domain; struct tomoyo_path_info name; name.name = domainname; tomoyo_fill_path_info(&name); list_for_each_entry_rcu(domain, &tomoyo_domain_list, list, srcu_read_lock_held(&tomoyo_ss)) { if (!domain->is_deleted && !tomoyo_pathcmp(&name, domain->domainname)) return domain; } return NULL; } /** * tomoyo_const_part_length - Evaluate the initial length without a pattern in a token. * * @filename: The string to evaluate. * * Returns the initial length without a pattern in @filename. */ static int tomoyo_const_part_length(const char *filename) { char c; int len = 0; if (!filename) return 0; while ((c = *filename++) != '\0') { if (c != '\\') { len++; continue; } c = *filename++; switch (c) { case '\\': /* "\\" */ len += 2; continue; case '0': /* "\ooo" */ case '1': case '2': case '3': c = *filename++; if (c < '0' || c > '7') break; c = *filename++; if (c < '0' || c > '7') break; len += 4; continue; } break; } return len; } /** * tomoyo_fill_path_info - Fill in "struct tomoyo_path_info" members. * * @ptr: Pointer to "struct tomoyo_path_info" to fill in. * * The caller sets "struct tomoyo_path_info"->name. */ void tomoyo_fill_path_info(struct tomoyo_path_info *ptr) { const char *name = ptr->name; const int len = strlen(name); ptr->const_len = tomoyo_const_part_length(name); ptr->is_dir = len && (name[len - 1] == '/'); ptr->is_patterned = (ptr->const_len < len); ptr->hash = full_name_hash(NULL, name, len); } /** * tomoyo_file_matches_pattern2 - Pattern matching without '/' character and "\-" pattern. * * @filename: The start of string to check. * @filename_end: The end of string to check. * @pattern: The start of pattern to compare. * @pattern_end: The end of pattern to compare. * * Returns true if @filename matches @pattern, false otherwise. */ static bool tomoyo_file_matches_pattern2(const char *filename, const char *filename_end, const char *pattern, const char *pattern_end) { while (filename < filename_end && pattern < pattern_end) { char c; int i; int j; if (*pattern != '\\') { if (*filename++ != *pattern++) return false; continue; } c = *filename; pattern++; switch (*pattern) { case '?': if (c == '/') { return false; } else if (c == '\\') { if (filename[1] == '\\') filename++; else if (tomoyo_byte_range(filename + 1)) filename += 3; else return false; } break; case '\\': if (c != '\\') return false; if (*++filename != '\\') return false; break; case '+': if (!isdigit(c)) return false; break; case 'x': if (!isxdigit(c)) return false; break; case 'a': if (!tomoyo_alphabet_char(c)) return false; break; case '0': case '1': case '2': case '3': if (c == '\\' && tomoyo_byte_range(filename + 1) && strncmp(filename + 1, pattern, 3) == 0) { filename += 3; pattern += 2; break; } return false; /* Not matched. */ case '*': case '@': for (i = 0; i <= filename_end - filename; i++) { if (tomoyo_file_matches_pattern2( filename + i, filename_end, pattern + 1, pattern_end)) return true; c = filename[i]; if (c == '.' && *pattern == '@') break; if (c != '\\') continue; if (filename[i + 1] == '\\') i++; else if (tomoyo_byte_range(filename + i + 1)) i += 3; else break; /* Bad pattern. */ } return false; /* Not matched. */ default: j = 0; c = *pattern; if (c == '$') { while (isdigit(filename[j])) j++; } else if (c == 'X') { while (isxdigit(filename[j])) j++; } else if (c == 'A') { while (tomoyo_alphabet_char(filename[j])) j++; } for (i = 1; i <= j; i++) { if (tomoyo_file_matches_pattern2( filename + i, filename_end, pattern + 1, pattern_end)) return true; } return false; /* Not matched or bad pattern. */ } filename++; pattern++; } while (*pattern == '\\' && (*(pattern + 1) == '*' || *(pattern + 1) == '@')) pattern += 2; return filename == filename_end && pattern == pattern_end; } /** * tomoyo_file_matches_pattern - Pattern matching without '/' character. * * @filename: The start of string to check. * @filename_end: The end of string to check. * @pattern: The start of pattern to compare. * @pattern_end: The end of pattern to compare. * * Returns true if @filename matches @pattern, false otherwise. */ static bool tomoyo_file_matches_pattern(const char *filename, const char *filename_end, const char *pattern, const char *pattern_end) { const char *pattern_start = pattern; bool first = true; bool result; while (pattern < pattern_end - 1) { /* Split at "\-" pattern. */ if (*pattern++ != '\\' || *pattern++ != '-') continue; result = tomoyo_file_matches_pattern2(filename, filename_end, pattern_start, pattern - 2); if (first) result = !result; if (result) return false; first = false; pattern_start = pattern; } result = tomoyo_file_matches_pattern2(filename, filename_end, pattern_start, pattern_end); return first ? result : !result; } /** * tomoyo_path_matches_pattern2 - Do pathname pattern matching. * * @f: The start of string to check. * @p: The start of pattern to compare. * * Returns true if @f matches @p, false otherwise. */ static bool tomoyo_path_matches_pattern2(const char *f, const char *p) { const char *f_delimiter; const char *p_delimiter; while (*f && *p) { f_delimiter = strchr(f, '/'); if (!f_delimiter) f_delimiter = f + strlen(f); p_delimiter = strchr(p, '/'); if (!p_delimiter) p_delimiter = p + strlen(p); if (*p == '\\' && *(p + 1) == '{') goto recursive; if (!tomoyo_file_matches_pattern(f, f_delimiter, p, p_delimiter)) return false; f = f_delimiter; if (*f) f++; p = p_delimiter; if (*p) p++; } /* Ignore trailing "\*" and "\@" in @pattern. */ while (*p == '\\' && (*(p + 1) == '*' || *(p + 1) == '@')) p += 2; return !*f && !*p; recursive: /* * The "\{" pattern is permitted only after '/' character. * This guarantees that below "*(p - 1)" is safe. * Also, the "\}" pattern is permitted only before '/' character * so that "\{" + "\}" pair will not break the "\-" operator. */ if (*(p - 1) != '/' || p_delimiter <= p + 3 || *p_delimiter != '/' || *(p_delimiter - 1) != '}' || *(p_delimiter - 2) != '\\') return false; /* Bad pattern. */ do { /* Compare current component with pattern. */ if (!tomoyo_file_matches_pattern(f, f_delimiter, p + 2, p_delimiter - 2)) break; /* Proceed to next component. */ f = f_delimiter; if (!*f) break; f++; /* Continue comparison. */ if (tomoyo_path_matches_pattern2(f, p_delimiter + 1)) return true; f_delimiter = strchr(f, '/'); } while (f_delimiter); return false; /* Not matched. */ } /** * tomoyo_path_matches_pattern - Check whether the given filename matches the given pattern. * * @filename: The filename to check. * @pattern: The pattern to compare. * * Returns true if matches, false otherwise. * * The following patterns are available. * \\ \ itself. * \ooo Octal representation of a byte. * \* Zero or more repetitions of characters other than '/'. * \@ Zero or more repetitions of characters other than '/' or '.'. * \? 1 byte character other than '/'. * \$ One or more repetitions of decimal digits. * \+ 1 decimal digit. * \X One or more repetitions of hexadecimal digits. * \x 1 hexadecimal digit. * \A One or more repetitions of alphabet characters. * \a 1 alphabet character. * * \- Subtraction operator. * * /\{dir\}/ '/' + 'One or more repetitions of dir/' (e.g. /dir/ /dir/dir/ * /dir/dir/dir/ ). */ bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename, const struct tomoyo_path_info *pattern) { const char *f = filename->name; const char *p = pattern->name; const int len = pattern->const_len; /* If @pattern doesn't contain pattern, I can use strcmp(). */ if (!pattern->is_patterned) return !tomoyo_pathcmp(filename, pattern); /* Don't compare directory and non-directory. */ if (filename->is_dir != pattern->is_dir) return false; /* Compare the initial length without patterns. */ if (strncmp(f, p, len)) return false; f += len; p += len; return tomoyo_path_matches_pattern2(f, p); } /** * tomoyo_get_exe - Get tomoyo_realpath() of current process. * * Returns the tomoyo_realpath() of current process on success, NULL otherwise. * * This function uses kzalloc(), so the caller must call kfree() * if this function didn't return NULL. */ const char *tomoyo_get_exe(void) { struct file *exe_file; const char *cp; struct mm_struct *mm = current->mm; if (!mm) return NULL; exe_file = get_mm_exe_file(mm); if (!exe_file) return NULL; cp = tomoyo_realpath_from_path(&exe_file->f_path); fput(exe_file); return cp; } /** * tomoyo_get_mode - Get MAC mode. * * @ns: Pointer to "struct tomoyo_policy_namespace". * @profile: Profile number. * @index: Index number of functionality. * * Returns mode. */ int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile, const u8 index) { u8 mode; struct tomoyo_profile *p; if (!tomoyo_policy_loaded) return TOMOYO_CONFIG_DISABLED; p = tomoyo_profile(ns, profile); mode = p->config[index]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->config[tomoyo_index2category[index] + TOMOYO_MAX_MAC_INDEX]; if (mode == TOMOYO_CONFIG_USE_DEFAULT) mode = p->default_config; return mode & 3; } /** * tomoyo_init_request_info - Initialize "struct tomoyo_request_info" members. * * @r: Pointer to "struct tomoyo_request_info" to initialize. * @domain: Pointer to "struct tomoyo_domain_info". NULL for tomoyo_domain(). * @index: Index number of functionality. * * Returns mode. */ int tomoyo_init_request_info(struct tomoyo_request_info *r, struct tomoyo_domain_info *domain, const u8 index) { u8 profile; memset(r, 0, sizeof(*r)); if (!domain) domain = tomoyo_domain(); r->domain = domain; profile = domain->profile; r->profile = profile; r->type = index; r->mode = tomoyo_get_mode(domain->ns, profile, index); return r->mode; } /** * tomoyo_domain_quota_is_ok - Check for domain's quota. * * @r: Pointer to "struct tomoyo_request_info". * * Returns true if the domain is not exceeded quota, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r) { unsigned int count = 0; struct tomoyo_domain_info *domain = r->domain; struct tomoyo_acl_info *ptr; if (r->mode != TOMOYO_CONFIG_LEARNING) return false; if (!domain) return true; if (READ_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED])) return false; list_for_each_entry_rcu(ptr, &domain->acl_info_list, list, srcu_read_lock_held(&tomoyo_ss)) { u16 perm; if (ptr->is_deleted) continue; /* * Reading perm bitmap might race with tomoyo_merge_*() because * caller does not hold tomoyo_policy_lock mutex. But exceeding * max_learning_entry parameter by a few entries does not harm. */ switch (ptr->type) { case TOMOYO_TYPE_PATH_ACL: perm = data_race(container_of(ptr, struct tomoyo_path_acl, head)->perm); break; case TOMOYO_TYPE_PATH2_ACL: perm = data_race(container_of(ptr, struct tomoyo_path2_acl, head)->perm); break; case TOMOYO_TYPE_PATH_NUMBER_ACL: perm = data_race(container_of(ptr, struct tomoyo_path_number_acl, head) ->perm); break; case TOMOYO_TYPE_MKDEV_ACL: perm = data_race(container_of(ptr, struct tomoyo_mkdev_acl, head)->perm); break; case TOMOYO_TYPE_INET_ACL: perm = data_race(container_of(ptr, struct tomoyo_inet_acl, head)->perm); break; case TOMOYO_TYPE_UNIX_ACL: perm = data_race(container_of(ptr, struct tomoyo_unix_acl, head)->perm); break; case TOMOYO_TYPE_MANUAL_TASK_ACL: perm = 0; break; default: perm = 1; } count += hweight16(perm); } if (count < tomoyo_profile(domain->ns, domain->profile)-> pref[TOMOYO_PREF_MAX_LEARNING_ENTRY]) return true; WRITE_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED], true); /* r->granted = false; */ tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]); #ifndef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n", domain->domainname->name); #endif return false; }
10 10 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 // SPDX-License-Identifier: GPL-2.0-only /* * This file provides /sys/class/ieee80211/<wiphy name>/ * and some default attributes. * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2021, 2023-2024 Intel Corporation */ #include <linux/device.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "sysfs.h" #include "core.h" #include "rdev-ops.h" static inline struct cfg80211_registered_device *dev_to_rdev( struct device *dev) { return container_of(dev, struct cfg80211_registered_device, wiphy.dev); } #define SHOW_FMT(name, fmt, member) \ static ssize_t name ## _show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } \ static DEVICE_ATTR_RO(name) SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); static ssize_t addresses_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; char *start = buf; int i; if (!wiphy->addresses) return sprintf(buf, "%pM\n", wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } static DEVICE_ATTR_RO(addresses); static struct attribute *ieee80211_attrs[] = { &dev_attr_index.attr, &dev_attr_macaddress.attr, &dev_attr_address_mask.attr, &dev_attr_addresses.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(ieee80211); static void wiphy_dev_release(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); cfg80211_dev_free(rdev); } #ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_leave(rdev, wdev); } static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); } cfg80211_process_wiphy_works(rdev, NULL); if (rdev->ops->suspend) ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config); if (ret == 1) { /* Driver refuse to configure wowlan */ cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); cfg80211_process_wiphy_works(rdev, NULL); ret = rdev_suspend(rdev, NULL); } if (ret == 0) rdev->suspended = true; } wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; } static int wiphy_resume(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; /* Age scan results with time spent in suspend */ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; queue_work(system_unbound_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return ret; } static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); #define WIPHY_PM_OPS (&wiphy_pm_ops) #else #define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(const struct device *d) { struct wiphy *wiphy = container_of(d, struct wiphy, dev); return wiphy_net(wiphy); } struct class ieee80211_class = { .name = "ieee80211", .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) { return class_register(&ieee80211_class); } void wiphy_sysfs_exit(void) { class_unregister(&ieee80211_class); }
5 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 // SPDX-License-Identifier: GPL-2.0 #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H #include <linux/file.h> #include <linux/io_uring_types.h> #include "rsrc.h" bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); int io_register_file_alloc_range(struct io_ring_ctx *ctx, struct io_uring_file_index_range __user *arg); io_req_flags_t io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { WARN_ON_ONCE(!test_bit(bit, table->bitmap)); __clear_bit(bit, table->bitmap); table->alloc_hint = bit; } static inline void io_file_bitmap_set(struct io_file_table *table, int bit) { WARN_ON_ONCE(test_bit(bit, table->bitmap)); __set_bit(bit, table->bitmap); table->alloc_hint = bit + 1; } #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline unsigned int io_slot_flags(struct io_rsrc_node *node) { return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; } static inline struct file *io_slot_file(struct io_rsrc_node *node) { return (struct file *)(node->file_ptr & FFS_MASK); } static inline void io_fixed_file_set(struct io_rsrc_node *node, struct file *file) { node->file_ptr = (unsigned long)file | (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, unsigned off, unsigned len) { ctx->file_alloc_start = off; ctx->file_alloc_end = off + len; ctx->file_table.alloc_hint = ctx->file_alloc_start; } #endif
6278 2889 4461 2871 2873 4430 4454 4430 4456 4464 6271 4452 72 70 4393 4431 4405 4093 413 349 349 6336 6592 6602 6271 6268 4495 4495 6263 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0-only /* * This implements the various checks for CONFIG_HARDENED_USERCOPY*, * which are designed to protect kernel memory from needless exposure * and overwrite under many unintended conditions. This code is based * on PAX_USERCOPY, which is: * * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source * Security Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/highmem.h> #include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include <asm/sections.h> #include "slab.h" /* * Checks if a given pointer and length is contained by the current * stack frame (if possible). * * Returns: * NOT_STACK: not at all on the stack * GOOD_FRAME: fully within a valid stack frame * GOOD_STACK: within the current stack (when can't frame-check exactly) * BAD_STACK: error condition (invalid stack position or bad stack frame) */ static noinline int check_stack_object(const void *obj, unsigned long len) { const void * const stack = task_stack_page(current); const void * const stackend = stack + THREAD_SIZE; int ret; /* Object is not on the stack at all. */ if (obj + len <= stack || stackend <= obj) return NOT_STACK; /* * Reject: object partially overlaps the stack (passing the * check above means at least one end is within the stack, * so if this check fails, the other end is outside the stack). */ if (obj < stack || stackend < obj + len) return BAD_STACK; /* Check if object is safely within a valid frame. */ ret = arch_within_stack_frames(stack, stackend, obj, len); if (ret) return ret; /* Finally, check stack depth if possible. */ #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER if (IS_ENABLED(CONFIG_STACK_GROWSUP)) { if ((void *)current_stack_pointer < obj + len) return BAD_STACK; } else { if (obj < (void *)current_stack_pointer) return BAD_STACK; } #endif return GOOD_STACK; } /* * If these functions are reached, then CONFIG_HARDENED_USERCOPY has found * an unexpected state during a copy_from_user() or copy_to_user() call. * There are several checks being performed on the buffer by the * __check_object_size() function. Normal stack buffer usage should never * trip the checks, and kernel text addressing will always trip the check. * For cache objects, it is checking that only the whitelisted range of * bytes for a given cache is being accessed (via the cache's usersize and * useroffset fields). To adjust a cache whitelist, use the usercopy-aware * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) { pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", to_user ? "exposure" : "overwrite", to_user ? "from" : "to", name ? : "unknown?!", detail ? " '" : "", detail ? : "", detail ? "'" : "", offset, len); /* * For greater effect, it would be nice to do do_group_exit(), * but BUG() actually hooks all the lock-breaking and per-arch * Oops code, so that is used here instead. */ BUG(); } /* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */ static bool overlaps(const unsigned long ptr, unsigned long n, unsigned long low, unsigned long high) { const unsigned long check_low = ptr; unsigned long check_high = check_low + n; /* Does not overlap if entirely above or entirely below. */ if (check_low >= high || check_high <= low) return false; return true; } /* Is this address range in the kernel text area? */ static inline void check_kernel_text_object(const unsigned long ptr, unsigned long n, bool to_user) { unsigned long textlow = (unsigned long)_stext; unsigned long texthigh = (unsigned long)_etext; unsigned long textlow_linear, texthigh_linear; if (overlaps(ptr, n, textlow, texthigh)) usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n); /* * Some architectures have virtual memory mappings with a secondary * mapping of the kernel text, i.e. there is more than one virtual * kernel address that points to the kernel image. It is usually * when there is a separate linear physical memory mapping, in that * __pa() is not just the reverse of __va(). This can be detected * and checked: */ textlow_linear = (unsigned long)lm_alias(textlow); /* No different mapping: we're done. */ if (textlow_linear == textlow) return; /* Check the secondary mapping... */ texthigh_linear = (unsigned long)lm_alias(texthigh); if (overlaps(ptr, n, textlow_linear, texthigh_linear)) usercopy_abort("linear kernel text", NULL, to_user, ptr - textlow_linear, n); } static inline void check_bogus_address(const unsigned long ptr, unsigned long n, bool to_user) { /* Reject if object wraps past end of memory. */ if (ptr + (n - 1) < ptr) usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n); /* Reject if NULL or ZERO-allocation. */ if (ZERO_OR_NULL_PTR(ptr)) usercopy_abort("null address", NULL, to_user, ptr, n); } static inline void check_heap_object(const void *ptr, unsigned long n, bool to_user) { unsigned long addr = (unsigned long)ptr; unsigned long offset; struct folio *folio; if (is_kmap_addr(ptr)) { offset = offset_in_page(ptr); if (n > PAGE_SIZE - offset) usercopy_abort("kmap", NULL, to_user, offset, n); return; } if (is_vmalloc_addr(ptr) && !pagefault_disabled()) { struct vmap_area *area = find_vmap_area(addr); if (!area) usercopy_abort("vmalloc", "no area", to_user, 0, n); if (n > area->va_end - addr) { offset = addr - area->va_start; usercopy_abort("vmalloc", NULL, to_user, offset, n); } return; } if (!virt_addr_valid(ptr)) return; folio = virt_to_folio(ptr); if (folio_test_slab(folio)) { /* Check slab allocator for flags and size. */ __check_heap_object(ptr, n, folio_slab(folio), to_user); } else if (folio_test_large(folio)) { offset = ptr - folio_address(folio); if (n > folio_size(folio) - offset) usercopy_abort("page alloc", NULL, to_user, offset, n); } } static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); /* * Validates that the given object is: * - not bogus address * - fully contained by stack (or stack frame, when available) * - fully within SLAB object (or object whitelist area, when available) * - not in kernel text */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { if (static_branch_unlikely(&bypass_usercopy_checks)) return; /* Skip all tests if size is zero. */ if (!n) return; /* Check for invalid addresses. */ check_bogus_address((const unsigned long)ptr, n, to_user); /* Check for bad stack object. */ switch (check_stack_object(ptr, n)) { case NOT_STACK: /* Object is not touching the current process stack. */ break; case GOOD_FRAME: case GOOD_STACK: /* * Object is either in the correct frame (when it * is possible to check) or just generally on the * process stack (when frame checking not available). */ return; default: usercopy_abort("process stack", NULL, to_user, #ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER IS_ENABLED(CONFIG_STACK_GROWSUP) ? ptr - (void *)current_stack_pointer : (void *)current_stack_pointer - ptr, #else 0, #endif n); } /* Check for bad heap object. */ check_heap_object(ptr, n, to_user); /* Check for object in kernel to avoid text exposure. */ check_kernel_text_object((const unsigned long)ptr, n, to_user); } EXPORT_SYMBOL(__check_object_size); static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { if (kstrtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; } __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { if (enable_checks == false) static_branch_enable(&bypass_usercopy_checks); return 1; } late_initcall(set_hardened_usercopy);
31 23 23 22 22 29 29 29 27 28 3 28 11 8 28 28 28 20 26 17 18 36 36 18 31 30 17 17 29 27 27 2 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ /* * Basic idea behind the notification queue: An fsnotify group (like inotify) * sends the userspace notification about events asynchronously some time after * the event happened. When inotify gets an event it will need to add that * event to the group notify queue. Since a single event might need to be on * multiple group's notification queues we can't add the event directly to each * queue and instead add a small "event_holder" to each queue. This event_holder * has a pointer back to the original event. Since the majority of events are * going to end up on one, and only one, notification queue we embed one * event_holder into each event. This means we have a single allocation instead * of always needing two. If the embedded event_holder is already in use by * another group a new event_holder (from fsnotify_event_holder_cachep) will be * allocated and used. */ #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/mount.h> #include <linux/mutex.h> #include <linux/namei.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. * Called from fsnotify_move, which is inlined into filesystem modules. */ u32 fsnotify_get_cookie(void) { return atomic_inc_return(&fsnotify_sync_cookie); } EXPORT_SYMBOL_GPL(fsnotify_get_cookie); void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event) { /* Overflow events are per-group and we don't want to free them */ if (!event || event == group->overflow_event) return; /* * If the event is still queued, we have a problem... Do an unreliable * lockless check first to avoid locking in the common case. The * locking may be necessary for permission events which got removed * from the list by a different CPU than the one freeing the event. */ if (!list_empty(&event->list)) { spin_lock(&group->notification_lock); WARN_ON(!list_empty(&event->list)); spin_unlock(&group->notification_lock); } group->ops->free_event(group, event); } /* * Try to add an event to the notification queue. * The group can later pull this event off the queue to deal with. * The group can use the @merge hook to merge the event with a queued event. * The group can use the @insert hook to insert the event into hash table. * The function returns: * 0 if the event was added to a queue * 1 if the event was merged with some other queued event * 2 if the event was not queued - either the queue of events has overflown * or the group is shutting down. */ int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; pr_debug("%s: group=%p event=%p\n", __func__, group, event); spin_lock(&group->notification_lock); if (group->shutdown) { spin_unlock(&group->notification_lock); return 2; } if (event == group->overflow_event || group->q_len >= group->max_events) { ret = 2; /* Queue overflow event only if it isn't already queued */ if (!list_empty(&group->overflow_event->list)) { spin_unlock(&group->notification_lock); return ret; } event = group->overflow_event; goto queue; } if (!list_empty(list) && merge) { ret = merge(group, event); if (ret) { spin_unlock(&group->notification_lock); return ret; } } queue: group->q_len++; list_add_tail(&event->list, list); if (insert) insert(group, event); spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); return ret; } void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event) { assert_spin_locked(&group->notification_lock); /* * We need to init list head for the case of overflow event so that * check in fsnotify_add_event() works */ list_del_init(&event->list); group->q_len--; } /* * Return the first event on the notification list without removing it. * Returns NULL if the list is empty. */ struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); if (fsnotify_notify_queue_is_empty(group)) return NULL; return list_first_entry(&group->notification_list, struct fsnotify_event, list); } /* * Remove and return the first event from the notification list. It is the * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { struct fsnotify_event *event = fsnotify_peek_first_event(group); if (!event) return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); fsnotify_remove_queued_event(group, event); return event; } /* * Called when a group is being torn down to clean up any outstanding * event notifications. */ void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; spin_lock(&group->notification_lock); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_first_event(group); spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, event); spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); }
863 865 865 867 863 864 867 858 2 2 2 1654 1660 1657 1655 1653 1652 1652 1658 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0 #include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); #endif /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; int flags = notify & TWA_FLAGS; notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { /* * Record the work call stack in order to print it in KASAN * reports. * * Note that stack allocation can fail if TWAF_NO_ALLOC flag * is set and new page is needed to expand the stack buffer. */ if (flags & TWAF_NO_ALLOC) kasan_record_aux_stack_noalloc(work); else kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() * @task: the task which should execute the func's work * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } static bool task_work_match(struct callback_head *cb, void *data) { return cb == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @cb: the callback to remove if queued * * Remove a callback from a task's queue if queued. * * RETURNS: * True if the callback was queued and got cancelled, false otherwise. */ bool task_work_cancel(struct task_struct *task, struct callback_head *cb) { struct callback_head *ret; ret = task_work_cancel_match(task, task_work_match, cb); return ret == cb; } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } }
4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 // SPDX-License-Identifier: GPL-2.0-only /* * mm/userfaultfd.c * * Copyright (C) 2015 Red Hat, Inc. */ #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/userfaultfd_k.h> #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) { /* Make sure that the dst range is fully within dst_vma. */ if (dst_end > dst_vma->vm_end) return false; /* * Check the vma is registered in uffd, this is required to * enforce the VM_MAYWRITE check done at uffd registration * time. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) return false; return true; } static __always_inline struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; mmap_assert_locked(mm); vma = vma_lookup(mm, addr); if (!vma) vma = ERR_PTR(-ENOENT); else if (!(vma->vm_flags & VM_SHARED) && unlikely(anon_vma_prepare(vma))) vma = ERR_PTR(-ENOMEM); return vma; } #ifdef CONFIG_PER_VMA_LOCK /* * uffd_lock_vma() - Lookup and lock vma corresponding to @address. * @mm: mm to search vma in. * @address: address that the vma should contain. * * Should be called without holding mmap_lock. * * Return: A locked vma containing @address, -ENOENT if no vma is found, or * -ENOMEM if anon_vma couldn't be allocated. */ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, unsigned long address) { struct vm_area_struct *vma; vma = lock_vma_under_rcu(mm, address); if (vma) { /* * We know we're going to need to use anon_vma, so check * that early. */ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) vma_end_read(vma); else return vma; } mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); if (!IS_ERR(vma)) { /* * We cannot use vma_start_read() as it may fail due to * false locked (see comment in vma_start_read()). We * can avoid that by directly locking vm_lock under * mmap_lock, which guarantees that nobody can lock the * vma for write (vma_start_write()) under us. */ down_read(&vma->vm_lock->lock); } mmap_read_unlock(mm); return vma; } static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len) { struct vm_area_struct *dst_vma; dst_vma = uffd_lock_vma(dst_mm, dst_start); if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) return dst_vma; vma_end_read(dst_vma); return ERR_PTR(-ENOENT); } static void uffd_mfill_unlock(struct vm_area_struct *vma) { vma_end_read(vma); } #else static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len) { struct vm_area_struct *dst_vma; mmap_read_lock(dst_mm); dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); if (IS_ERR(dst_vma)) goto out_unlock; if (validate_dst_vma(dst_vma, dst_start + len)) return dst_vma; dst_vma = ERR_PTR(-ENOENT); out_unlock: mmap_read_unlock(dst_mm); return dst_vma; } static void uffd_mfill_unlock(struct vm_area_struct *vma) { mmap_read_unlock(vma->vm_mm); } #endif /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) { struct inode *inode; pgoff_t offset, max_off; if (!dst_vma->vm_file) return false; inode = dst_vma->vm_file->f_inode; offset = linear_page_index(dst_vma, dst_addr); max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); return offset >= max_off; } /* * Install PTEs, to map dst_addr (within dst_vma) to page. * * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. */ int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; spinlock_t *ptl; struct folio *folio = page_folio(page); bool page_in_cache = folio_mapping(folio); _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; if (writable) _dst_pte = pte_mkwrite(_dst_pte, dst_vma); if (flags & MFILL_ATOMIC_WP) _dst_pte = pte_mkuffd_wp(_dst_pte); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; goto out_unlock; } ret = -EEXIST; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ if (!pte_none_mostly(ptep_get(dst_pte))) goto out_unlock; if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, dst_vma); } /* * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). */ inc_mm_counter(dst_mm, mm_counter(folio)); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); out: return ret; } static int mfill_atomic_pte_copy(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { void *kaddr; int ret; struct folio *folio; if (!*foliop) { ret = -ENOMEM; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, dst_addr); if (!folio) goto out; kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the * mmap_lock being read recursive a deadlock is still * possible if a writer has taken a lock. For example: * * process A thread 1 takes read lock on own mmap_lock * process A thread 2 calls mmap, blocks taking write lock * process B thread 1 takes page fault, read lock on own mmap lock * process B thread 2 calls mmap, blocks taking write lock * process A thread 1 blocks taking read lock on process B * process B thread 1 blocks taking read lock on process A * * Disable page faults to prevent potential deadlock * and retry the copy outside the mmap_lock. */ pagefault_disable(); ret = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); pagefault_enable(); kunmap_local(kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; *foliop = folio; /* don't free the page */ goto out; } flush_dcache_folio(folio); } else { folio = *foliop; *foliop = NULL; } /* * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __folio_mark_uptodate(folio); ret = -ENOMEM; if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) goto out_release; ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, flags); if (ret) goto out_release; out: return ret; out_release: folio_put(folio); goto out; } static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) { struct folio *folio; int ret = -ENOMEM; folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); if (!folio) return ret; if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) goto out_put; /* * The memory barrier inside __folio_mark_uptodate makes sure that * zeroing out the folio become visible before mapping the page * using set_pte_at(). See do_anonymous_page(). */ __folio_mark_uptodate(folio); ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, 0); if (ret) goto out_put; return 0; out_put: folio_put(folio); return ret; } static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) { pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; if (mm_forbids_zeropage(dst_vma->vm_mm)) return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; goto out_unlock; } ret = -EEXIST; if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); out: return ret; } /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, uffd_flags_t flags) { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); struct folio *folio; struct page *page; int ret; ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; if (ret) goto out; if (!folio) { ret = -EFAULT; goto out; } page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; goto out_release; } ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, page, false, flags); if (ret) goto out_release; folio_unlock(folio); ret = 0; out: return ret; out_release: folio_unlock(folio); folio_put(folio); goto out; } /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, uffd_flags_t flags) { int ret; struct mm_struct *dst_mm = dst_vma->vm_mm; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; goto out_unlock; } ret = -EEXIST; /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ if (!pte_none(ptep_get(dst_pte))) goto out_unlock; set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); out: return ret; } static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, address); if (!pud) return NULL; /* * Note that we didn't run this because the pmd was * missing, the *pmd may be already established and in * turn it may also be a trans_huge_pmd. */ return pmd_alloc(mm, pud, address); } #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is * called with either vma-lock or mmap_lock held, it will release the lock * before returning. */ static __always_inline ssize_t mfill_atomic_hugetlb( struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { struct mm_struct *dst_mm = dst_vma->vm_mm; ssize_t err; pte_t *dst_pte; unsigned long src_addr, dst_addr; long copied; struct folio *folio; unsigned long vma_hpagesize; pgoff_t idx; u32 hash; struct address_space *mapping; /* * There is no default zero huge page for all huge page sizes as * supported by hugetlb. A PMD_SIZE huge pages may exist as used * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); return -EINVAL; } src_addr = src_start; dst_addr = dst_start; copied = 0; folio = NULL; vma_hpagesize = vma_kernel_pagesize(dst_vma); /* * Validate alignment based on huge page size */ err = -EINVAL; if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) goto out_unlock; retry: /* * On routine entry dst_vma is set. If we had to drop mmap_lock and * retry, dst_vma will be set to NULL and we must lookup again. */ if (!dst_vma) { dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); if (IS_ERR(dst_vma)) { err = PTR_ERR(dst_vma); goto out; } err = -ENOENT; if (!is_vm_hugetlb_page(dst_vma)) goto out_unlock_vma; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) goto out_unlock_vma; /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ down_read(&ctx->map_changing_lock); err = -EAGAIN; if (atomic_read(&ctx->mmap_changing)) goto out_unlock; } while (src_addr < src_start + len) { BUG_ON(dst_addr >= dst_start + len); /* * Serialize via vma_lock and hugetlb_fault_mutex. * vma_lock ensures the dst_pte remains valid even * in the case of shared pmds. fault mutex prevents * races with other faulting threads. */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(dst_vma); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { err = -EEXIST; hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, src_addr, flags, &folio); hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); cond_resched(); if (unlikely(err == -ENOENT)) { up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); BUG_ON(!folio); err = copy_folio_from_user(folio, (const void __user *)src_addr, true); if (unlikely(err)) { err = -EFAULT; goto out; } dst_vma = NULL; goto retry; } else BUG_ON(folio); if (!err) { dst_addr += vma_hpagesize; src_addr += vma_hpagesize; copied += vma_hpagesize; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&ctx->map_changing_lock); out_unlock_vma: uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); return copied ? copied : err; } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop) { ssize_t err; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { return mfill_atomic_pte_continue(dst_pmd, dst_vma, dst_addr, flags); } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { return mfill_atomic_pte_poison(dst_pmd, dst_vma, dst_addr, flags); } /* * The normal page fault path for a shmem will invoke the * fault, fill the hole in the file and COW it right away. The * result generates plain anonymous memory. So when we are * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll * generate anonymous memory directly without actually filling * the hole. For the MAP_PRIVATE case the robustness check * only happens in the pagetable (to verify it's still none) * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) err = mfill_atomic_pte_copy(dst_pmd, dst_vma, dst_addr, src_addr, flags, foliop); else err = mfill_atomic_pte_zeropage(dst_pmd, dst_vma, dst_addr); } else { err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, foliop); } return err; } static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; unsigned long src_addr, dst_addr; long copied; struct folio *folio; /* * Sanitize the command parameters: */ BUG_ON(dst_start & ~PAGE_MASK); BUG_ON(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ BUG_ON(src_start + len <= src_start); BUG_ON(dst_start + len <= dst_start); src_addr = src_start; dst_addr = dst_start; copied = 0; folio = NULL; retry: /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. */ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); if (IS_ERR(dst_vma)) { err = PTR_ERR(dst_vma); goto out; } /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ down_read(&ctx->map_changing_lock); err = -EAGAIN; if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -EINVAL; /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. */ if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && dst_vma->vm_flags & VM_SHARED)) goto out_unlock; /* * validate 'mode' now that we know the dst_vma: don't allow * a wrprotect copy if the userfaultfd didn't register as WP. */ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) goto out_unlock; /* * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start, len, flags); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; if (!vma_is_shmem(dst_vma) && uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; while (src_addr < src_start + len) { pmd_t dst_pmdval; BUG_ON(dst_addr >= dst_start + len); dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmdp_get_lockless(dst_pmd); if (unlikely(pmd_none(dst_pmdval)) && unlikely(__pte_alloc(dst_mm, dst_pmd))) { err = -ENOMEM; break; } dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is THP don't override it and just be strict. * (This includes the case where the PMD used to be THP and * changed back to none after __pte_alloc().) */ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || pmd_devmap(dst_pmdval))) { err = -EEXIST; break; } if (unlikely(pmd_bad(dst_pmdval))) { err = -EFAULT; break; } /* * For shmem mappings, khugepaged is allowed to remove page * tables under us; pte_offset_map_lock() will deal with that. */ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); cond_resched(); if (unlikely(err == -ENOENT)) { void *kaddr; up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); BUG_ON(!folio); kaddr = kmap_local_folio(folio, 0); err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); kunmap_local(kaddr); if (unlikely(err)) { err = -EFAULT; goto out; } flush_dcache_folio(folio); goto retry; } else BUG_ON(folio); if (!err) { dst_addr += PAGE_SIZE; src_addr += PAGE_SIZE; copied += PAGE_SIZE; if (fatal_signal_pending(current)) err = -EINTR; } if (err) break; } out_unlock: up_read(&ctx->map_changing_lock); uffd_mfill_unlock(dst_vma); out: if (folio) folio_put(folio); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); return copied ? copied : err; } ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags) { return mfill_atomic(ctx, dst_start, src_start, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); } ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len) { return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); } ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { /* * A caller might reasonably assume that UFFDIO_CONTINUE contains an * smp_wmb() to ensure that any writes to the about-to-be-mapped page by * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to * subsequent loads from the page through the newly mapped address range. */ smp_wmb(); return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, uffd_flags_t flags) { return mfill_atomic(ctx, start, 0, len, uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); } long uffd_wp_range(struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { unsigned int mm_cp_flags; struct mmu_gather tlb; long ret; VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, "The address range exceeds VMA boundary.\n"); if (enable_wp) mm_cp_flags = MM_CP_UFFD_WP; else mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; /* * vma->vm_page_prot already reflects that uffd-wp is enabled for this * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed * to be write-protected as default whenever protection changes. * Try upgrading write permissions manually. */ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; tlb_gather_mmu(&tlb, dst_vma->vm_mm); ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); tlb_finish_mmu(&tlb); return ret; } int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, unsigned long len, bool enable_wp) { struct mm_struct *dst_mm = ctx->mm; unsigned long end = start + len; unsigned long _start, _end; struct vm_area_struct *dst_vma; unsigned long page_mask; long err; VMA_ITERATOR(vmi, dst_mm, start); /* * Sanitize the command parameters: */ BUG_ON(start & ~PAGE_MASK); BUG_ON(len & ~PAGE_MASK); /* Does the address range wrap, or is the span zero-sized? */ BUG_ON(start + len <= start); mmap_read_lock(dst_mm); /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ down_read(&ctx->map_changing_lock); err = -EAGAIN; if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -ENOENT; for_each_vma_range(vmi, dst_vma, end) { if (!userfaultfd_wp(dst_vma)) { err = -ENOENT; break; } if (is_vm_hugetlb_page(dst_vma)) { err = -EINVAL; page_mask = vma_kernel_pagesize(dst_vma) - 1; if ((start & page_mask) || (len & page_mask)) break; } _start = max(dst_vma->vm_start, start); _end = min(dst_vma->vm_end, end); err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); /* Return 0 on success, <0 on failures */ if (err < 0) break; err = 0; } out_unlock: up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return err; } void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2) __acquires(ptl1) __acquires(ptl2) { if (ptl1 > ptl2) swap(ptl1, ptl2); /* lock in virtual address order to avoid lock inversion */ spin_lock(ptl1); if (ptl1 != ptl2) spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); else __acquire(ptl2); } void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2) __releases(ptl1) __releases(ptl2) { spin_unlock(ptl1); if (ptl1 != ptl2) spin_unlock(ptl2); else __release(ptl2); } static int move_present_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, spinlock_t *dst_ptl, spinlock_t *src_ptl, struct folio *src_folio) { int err = 0; double_pt_lock(dst_ptl, src_ptl); if (!pte_same(ptep_get(src_pte), orig_src_pte) || !pte_same(ptep_get(dst_pte), orig_dst_pte)) { err = -EAGAIN; goto out; } if (folio_test_large(src_folio) || folio_maybe_dma_pinned(src_folio) || !PageAnonExclusive(&src_folio->page)) { err = -EBUSY; goto out; } orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { set_pte_at(mm, src_addr, src_pte, orig_src_pte); err = -EBUSY; goto out; } folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); out: double_pt_unlock(dst_ptl, src_ptl); return err; } static int move_swap_pte(struct mm_struct *mm, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, spinlock_t *dst_ptl, spinlock_t *src_ptl) { if (!pte_swp_exclusive(orig_src_pte)) return -EBUSY; double_pt_lock(dst_ptl, src_ptl); if (!pte_same(ptep_get(src_pte), orig_src_pte) || !pte_same(ptep_get(dst_pte), orig_dst_pte)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); return 0; } static int move_zeropage_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr, pte_t *dst_pte, pte_t *src_pte, pte_t orig_dst_pte, pte_t orig_src_pte, spinlock_t *dst_ptl, spinlock_t *src_ptl) { pte_t zero_pte; double_pt_lock(dst_ptl, src_ptl); if (!pte_same(ptep_get(src_pte), orig_src_pte) || !pte_same(ptep_get(dst_pte), orig_dst_pte)) { double_pt_unlock(dst_ptl, src_ptl); return -EAGAIN; } zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); ptep_clear_flush(src_vma, src_addr, src_pte); set_pte_at(mm, dst_addr, dst_pte, zero_pte); double_pt_unlock(dst_ptl, src_ptl); return 0; } /* * The mmap_lock for reading is held by the caller. Just move the page * from src_pmd to dst_pmd if possible, and return true if succeeded * in moving the page. */ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long dst_addr, unsigned long src_addr, __u64 mode) { swp_entry_t entry; pte_t orig_src_pte, orig_dst_pte; pte_t src_folio_pte; spinlock_t *src_ptl, *dst_ptl; pte_t *src_pte = NULL; pte_t *dst_pte = NULL; pmd_t dummy_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; int err = 0; flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr, src_addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); retry: /* * Use the maywrite version to indicate that dst_pte will be modified, * but since we will use pte_same() to detect the change of the pte * entry, there is no need to get pmdval, so just pass a dummy variable * to it. */ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, &dst_ptl); /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { err = -EAGAIN; goto out; } /* same as dst_pte */ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, &src_ptl); /* * We held the mmap_lock for reading so MADV_DONTNEED * can zap transparent huge pages under us, or the * transparent huge page fault can establish new * transparent huge pages under us. */ if (unlikely(!src_pte)) { err = -EAGAIN; goto out; } /* Sanity checks before the operation */ if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) { err = -EINVAL; goto out; } spin_lock(dst_ptl); orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { err = -EEXIST; goto out; } spin_lock(src_ptl); orig_src_pte = ptep_get(src_pte); spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) err = -ENOENT; else /* nothing to do to move a hole */ err = 0; goto out; } /* If PTE changed after we locked the folio them start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { err = -EAGAIN; goto out; } if (pte_present(orig_src_pte)) { if (is_zero_pfn(pte_pfn(orig_src_pte))) { err = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_ptl, src_ptl); goto out; } /* * Pin and lock both source folio and anon_vma. Since we are in * RCU read section, we can't block, so on contention have to * unmap the ptes, obtain the lock and retry. */ if (!src_folio) { struct folio *folio; /* * Pin the page while holding the lock to be sure the * page isn't freed under us */ spin_lock(src_ptl); if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); err = -EAGAIN; goto out; } folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !PageAnonExclusive(&folio->page)) { spin_unlock(src_ptl); err = -EBUSY; goto out; } folio_get(folio); src_folio = folio; src_folio_pte = orig_src_pte; spin_unlock(src_ptl); if (!folio_trylock(src_folio)) { pte_unmap(&orig_src_pte); pte_unmap(&orig_dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ folio_lock(src_folio); goto retry; } if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { err = -EBUSY; goto out; } } /* at this point we have src_folio locked */ if (folio_test_large(src_folio)) { /* split_folio() can block */ pte_unmap(&orig_src_pte); pte_unmap(&orig_dst_pte); src_pte = dst_pte = NULL; err = split_folio(src_folio); if (err) goto out; /* have to reacquire the folio after it got split */ folio_unlock(src_folio); folio_put(src_folio); src_folio = NULL; goto retry; } if (!src_anon_vma) { /* * folio_referenced walks the anon_vma chain * without the folio lock. Serialize against it with * the anon_vma lock, the folio lock is not enough. */ src_anon_vma = folio_get_anon_vma(src_folio); if (!src_anon_vma) { /* page was unmapped from under us */ err = -EAGAIN; goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { pte_unmap(&orig_src_pte); pte_unmap(&orig_dst_pte); src_pte = dst_pte = NULL; /* now we can block and wait */ anon_vma_lock_write(src_anon_vma); goto retry; } } err = move_present_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_ptl, src_ptl, src_folio); } else { entry = pte_to_swp_entry(orig_src_pte); if (non_swap_entry(entry)) { if (is_migration_entry(entry)) { pte_unmap(&orig_src_pte); pte_unmap(&orig_dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); err = -EAGAIN; } else err = -EFAULT; goto out; } err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_ptl, src_ptl); } out: if (src_anon_vma) { anon_vma_unlock_write(src_anon_vma); put_anon_vma(src_anon_vma); } if (src_folio) { folio_unlock(src_folio); folio_put(src_folio); } if (dst_pte) pte_unmap(dst_pte); if (src_pte) pte_unmap(src_pte); mmu_notifier_invalidate_range_end(&range); return err; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline bool move_splits_huge_pmd(unsigned long dst_addr, unsigned long src_addr, unsigned long src_end) { return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || src_end - src_addr < HPAGE_PMD_SIZE; } #else static inline bool move_splits_huge_pmd(unsigned long dst_addr, unsigned long src_addr, unsigned long src_end) { /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ return false; } #endif static inline bool vma_move_compatible(struct vm_area_struct *vma) { return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | VM_MIXEDMAP | VM_SHADOW_STACK)); } static int validate_move_areas(struct userfaultfd_ctx *ctx, struct vm_area_struct *src_vma, struct vm_area_struct *dst_vma) { /* Only allow moving if both have the same access and protection */ if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) return -EINVAL; /* Only allow moving if both are mlocked or both aren't */ if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) return -EINVAL; /* * For now, we keep it simple and only move between writable VMAs. * Access flags are equal, therefore cheching only the source is enough. */ if (!(src_vma->vm_flags & VM_WRITE)) return -EINVAL; /* Check if vma flags indicate content which can be moved */ if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) return -EINVAL; /* Ensure dst_vma is registered in uffd we are operating on */ if (!dst_vma->vm_userfaultfd_ctx.ctx || dst_vma->vm_userfaultfd_ctx.ctx != ctx) return -EINVAL; /* Only allow moving across anonymous vmas */ if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) return -EINVAL; return 0; } static __always_inline int find_vmas_mm_locked(struct mm_struct *mm, unsigned long dst_start, unsigned long src_start, struct vm_area_struct **dst_vmap, struct vm_area_struct **src_vmap) { struct vm_area_struct *vma; mmap_assert_locked(mm); vma = find_vma_and_prepare_anon(mm, dst_start); if (IS_ERR(vma)) return PTR_ERR(vma); *dst_vmap = vma; /* Skip finding src_vma if src_start is in dst_vma */ if (src_start >= vma->vm_start && src_start < vma->vm_end) goto out_success; vma = vma_lookup(mm, src_start); if (!vma) return -ENOENT; out_success: *src_vmap = vma; return 0; } #ifdef CONFIG_PER_VMA_LOCK static int uffd_move_lock(struct mm_struct *mm, unsigned long dst_start, unsigned long src_start, struct vm_area_struct **dst_vmap, struct vm_area_struct **src_vmap) { struct vm_area_struct *vma; int err; vma = uffd_lock_vma(mm, dst_start); if (IS_ERR(vma)) return PTR_ERR(vma); *dst_vmap = vma; /* * Skip finding src_vma if src_start is in dst_vma. This also ensures * that we don't lock the same vma twice. */ if (src_start >= vma->vm_start && src_start < vma->vm_end) { *src_vmap = vma; return 0; } /* * Using uffd_lock_vma() to get src_vma can lead to following deadlock: * * Thread1 Thread2 * ------- ------- * vma_start_read(dst_vma) * mmap_write_lock(mm) * vma_start_write(src_vma) * vma_start_read(src_vma) * mmap_read_lock(mm) * vma_start_write(dst_vma) */ *src_vmap = lock_vma_under_rcu(mm, src_start); if (likely(*src_vmap)) return 0; /* Undo any locking and retry in mmap_lock critical section */ vma_end_read(*dst_vmap); mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (!err) { /* * See comment in uffd_lock_vma() as to why not using * vma_start_read() here. */ down_read(&(*dst_vmap)->vm_lock->lock); if (*dst_vmap != *src_vmap) down_read_nested(&(*src_vmap)->vm_lock->lock, SINGLE_DEPTH_NESTING); } mmap_read_unlock(mm); return err; } static void uffd_move_unlock(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { vma_end_read(src_vma); if (src_vma != dst_vma) vma_end_read(dst_vma); } #else static int uffd_move_lock(struct mm_struct *mm, unsigned long dst_start, unsigned long src_start, struct vm_area_struct **dst_vmap, struct vm_area_struct **src_vmap) { int err; mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); if (err) mmap_read_unlock(mm); return err; } static void uffd_move_unlock(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { mmap_assert_locked(src_vma->vm_mm); mmap_read_unlock(dst_vma->vm_mm); } #endif /** * move_pages - move arbitrary anonymous pages of an existing vma * @ctx: pointer to the userfaultfd context * @dst_start: start of the destination virtual memory range * @src_start: start of the source virtual memory range * @len: length of the virtual memory range * @mode: flags from uffdio_move.mode * * It will either use the mmap_lock in read mode or per-vma locks * * move_pages() remaps arbitrary anonymous pages atomically in zero * copy. It only works on non shared anonymous pages because those can * be relocated without generating non linear anon_vmas in the rmap * code. * * It provides a zero copy mechanism to handle userspace page faults. * The source vma pages should have mapcount == 1, which can be * enforced by using madvise(MADV_DONTFORK) on src vma. * * The thread receiving the page during the userland page fault * will receive the faulting page in the source vma through the network, * storage or any other I/O device (MADV_DONTFORK in the source vma * avoids move_pages() to fail with -EBUSY if the process forks before * move_pages() is called), then it will call move_pages() to map the * page in the faulting address in the destination vma. * * This userfaultfd command works purely via pagetables, so it's the * most efficient way to move physical non shared anonymous pages * across different virtual addresses. Unlike mremap()/mmap()/munmap() * it does not create any new vmas. The mapping in the destination * address is atomic. * * It only works if the vma protection bits are identical from the * source and destination vma. * * It can remap non shared anonymous pages within the same vma too. * * If the source virtual memory range has any unmapped holes, or if * the destination virtual memory range is not a whole unmapped hole, * move_pages() will fail respectively with -ENOENT or -EEXIST. This * provides a very strict behavior to avoid any chance of memory * corruption going unnoticed if there are userland race conditions. * Only one thread should resolve the userland page fault at any given * time for any given faulting address. This means that if two threads * try to both call move_pages() on the same destination address at the * same time, the second thread will get an explicit error from this * command. * * The command retval will return "len" is successful. The command * however can be interrupted by fatal signals or errors. If * interrupted it will return the number of bytes successfully * remapped before the interruption if any, or the negative error if * none. It will never return zero. Either it will return an error or * an amount of bytes successfully moved. If the retval reports a * "short" remap, the move_pages() command should be repeated by * userland with src+retval, dst+reval, len-retval if it wants to know * about the error that interrupted it. * * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to * prevent -ENOENT errors to materialize if there are holes in the * source virtual range that is being remapped. The holes will be * accounted as successfully remapped in the retval of the * command. This is mostly useful to remap hugepage naturally aligned * virtual regions without knowing if there are transparent hugepage * in the regions or not, but preventing the risk of having to split * the hugepmd during the remap. * * If there's any rmap walk that is taking the anon_vma locks without * first obtaining the folio lock (the only current instance is * folio_referenced), they will have to verify if the folio->mapping * has changed after taking the anon_vma lock. If it changed they * should release the lock and retry obtaining a new anon_vma, because * it means the anon_vma was changed by move_pages() before the lock * could be obtained. This is the only additional complexity added to * the rmap code to provide this anonymous page remapping functionality. */ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, __u64 mode) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; unsigned long src_addr, dst_addr; pmd_t *src_pmd, *dst_pmd; long err = -EINVAL; ssize_t moved = 0; /* Sanitize the command parameters. */ if (WARN_ON_ONCE(src_start & ~PAGE_MASK) || WARN_ON_ONCE(dst_start & ~PAGE_MASK) || WARN_ON_ONCE(len & ~PAGE_MASK)) goto out; /* Does the address range wrap, or is the span zero-sized? */ if (WARN_ON_ONCE(src_start + len <= src_start) || WARN_ON_ONCE(dst_start + len <= dst_start)) goto out; err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); if (err) goto out; /* Re-check after taking map_changing_lock */ err = -EAGAIN; down_read(&ctx->map_changing_lock); if (likely(atomic_read(&ctx->mmap_changing))) goto out_unlock; /* * Make sure the vma is not shared, that the src and dst remap * ranges are both valid and fully within a single existing * vma. */ err = -EINVAL; if (src_vma->vm_flags & VM_SHARED) goto out_unlock; if (src_start + len > src_vma->vm_end) goto out_unlock; if (dst_vma->vm_flags & VM_SHARED) goto out_unlock; if (dst_start + len > dst_vma->vm_end) goto out_unlock; err = validate_move_areas(ctx, src_vma, dst_vma); if (err) goto out_unlock; for (src_addr = src_start, dst_addr = dst_start; src_addr < src_start + len;) { spinlock_t *ptl; pmd_t dst_pmdval; unsigned long step_size; /* * Below works because anonymous area would not have a * transparent huge PUD. If file-backed support is added, * that case would need to be handled here. */ src_pmd = mm_find_pmd(mm, src_addr); if (unlikely(!src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; break; } src_pmd = mm_alloc_pmd(mm, src_addr); if (unlikely(!src_pmd)) { err = -ENOMEM; break; } } dst_pmd = mm_alloc_pmd(mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; break; } dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is mapped as THP don't override it and just * be strict. If dst_pmd changes into TPH after this check, the * move_pages_huge_pmd() will detect the change and retry * while move_pages_pte() will detect the change and fail. */ if (unlikely(pmd_trans_huge(dst_pmdval))) { err = -EEXIST; break; } ptl = pmd_trans_huge_lock(src_pmd, src_vma); if (ptl) { if (pmd_devmap(*src_pmd)) { spin_unlock(ptl); err = -ENOENT; break; } /* Check if we can move the pmd without splitting it. */ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || !pmd_none(dst_pmdval)) { struct folio *folio = pmd_folio(*src_pmd); if (!folio || (!is_huge_zero_folio(folio) && !PageAnonExclusive(&folio->page))) { spin_unlock(ptl); err = -EBUSY; break; } spin_unlock(ptl); split_huge_pmd(src_vma, src_pmd, src_addr); /* The folio will be split by move_pages_pte() */ continue; } err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, dst_pmdval, dst_vma, src_vma, dst_addr, src_addr); step_size = HPAGE_PMD_SIZE; } else { if (pmd_none(*src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; break; } if (unlikely(__pte_alloc(mm, src_pmd))) { err = -ENOMEM; break; } } if (unlikely(pte_alloc(mm, dst_pmd))) { err = -ENOMEM; break; } err = move_pages_pte(mm, dst_pmd, src_pmd, dst_vma, src_vma, dst_addr, src_addr, mode); step_size = PAGE_SIZE; } cond_resched(); if (fatal_signal_pending(current)) { /* Do not override an error */ if (!err || err == -EAGAIN) err = -EINTR; break; } if (err) { if (err == -EAGAIN) continue; break; } /* Proceed to the next page */ dst_addr += step_size; src_addr += step_size; moved += step_size; } out_unlock: up_read(&ctx->map_changing_lock); uffd_move_unlock(dst_vma, src_vma); out: VM_WARN_ON(moved < 0); VM_WARN_ON(err > 0); VM_WARN_ON(!moved && !err); return moved ? moved : err; } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, vm_flags_t flags) { const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; vm_flags_reset(vma, flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. */ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) vma_set_page_prot(vma); } static void userfaultfd_set_ctx(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx, unsigned long flags) { vma_start_write(vma); vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; userfaultfd_set_vm_flags(vma, (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); } void userfaultfd_reset_ctx(struct vm_area_struct *vma) { userfaultfd_set_ctx(vma, NULL, 0); } struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct vm_area_struct *ret; /* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) uffd_wp_range(vma, start, end - start, false); ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, vma->vm_flags & ~__VM_UFFD_FLAGS, NULL_VM_UFFD_CTX); /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ if (!IS_ERR(ret)) userfaultfd_reset_ctx(ret); return ret; } /* Assumes mmap write lock taken, and mm_struct pinned. */ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, unsigned long vm_flags, unsigned long start, unsigned long end, bool wp_async) { VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; unsigned long new_flags; if (vma->vm_start < start) prev = vma; for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. */ if (vma->vm_userfaultfd_ctx.ctx == ctx && (vma->vm_flags & vm_flags) == vm_flags) goto skip; if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, new_flags, (struct vm_userfaultfd_ctx){ctx}); if (IS_ERR(vma)) return PTR_ERR(vma); /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ userfaultfd_set_ctx(vma, ctx, vm_flags); if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) hugetlb_unshare_all_pmds(vma); skip: prev = vma; start = vma->vm_end; } return 0; } void userfaultfd_release_new(struct userfaultfd_ctx *ctx) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == ctx) userfaultfd_reset_ctx(vma); } mmap_write_unlock(mm); } void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx) { struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, mm, 0); if (!mmget_not_zero(mm)) return; /* * Flush page faults out of all CPUs. NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx * changes while handle_userfault released the mmap_lock. So * it's critical that released is set to true (above), before * taking the mmap_lock for writing. */ mmap_write_lock(mm); prev = NULL; for_each_vma(vmi, vma) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; } vma = userfaultfd_clear_vma(&vmi, prev, vma, vma->vm_start, vma->vm_end); prev = vma; } mmap_write_unlock(mm); mmput(mm); }
3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 1 1 1 1 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * The HSR spec says never to forward the same frame twice on the same * interface. A frame is identified by its source MAC address and its HSR * sequence number. This code keeps track of senders and their sequence numbers * to allow filtering of duplicate frames, and to detect HSR ring errors. * Same code handles filtering of duplicates for PRP as well. */ #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/rculist.h> #include "hsr_main.h" #include "hsr_framereg.h" #include "hsr_netlink.h" /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ static bool seq_nr_after(u16 a, u16 b) { /* Remove inconsistency where * seq_nr_after(a, b) == seq_nr_before(a, b) */ if ((int)b - a == 32768) return false; return (((s16)(b - a)) < 0); } #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) { if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox)) return false; return ether_addr_equal(addr, hsr->macaddress_redbox); } bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_self_node *sn; bool ret = false; rcu_read_lock(); sn = rcu_dereference(hsr->self_node); if (!sn) { WARN_ONCE(1, "HSR: No self node\n"); goto out; } if (ether_addr_equal(addr, sn->macaddress_A) || ether_addr_equal(addr, sn->macaddress_B)) ret = true; out: rcu_read_unlock(); return ret; } /* Search for mac entry. Caller must hold rcu read lock. */ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, addr)) return node; } return NULL; } /* Check if node for a given MAC address is already present in data base */ bool hsr_is_node_in_db(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { return !!find_node_by_addr_A(node_db, addr); } /* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { struct hsr_self_node *sn, *old; sn = kmalloc(sizeof(*sn), GFP_KERNEL); if (!sn) return -ENOMEM; ether_addr_copy(sn->macaddress_A, addr_a); ether_addr_copy(sn->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, sn, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); return 0; } void hsr_del_self_node(struct hsr_priv *hsr) { struct hsr_self_node *old; spin_lock_bh(&hsr->list_lock); old = rcu_replace_pointer(hsr->self_node, NULL, lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); if (old) kfree_rcu(old, rcu_head); } void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; list_for_each_entry_safe(node, tmp, node_db, mac_list) kfree(node); } void prp_handle_san_frame(bool san, enum hsr_port_type port, struct hsr_node *node) { /* Mark if the SAN node is over LAN_A or LAN_B */ if (port == HSR_PT_SLAVE_A) { node->san_a = true; return; } if (port == HSR_PT_SLAVE_B) node->san_b = true; } /* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; * seq_out is used to initialize filtering of outgoing duplicate frames * originating from the newly added node. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, unsigned char addr[], u16 seq_out, bool san, enum hsr_port_type rx_port) { struct hsr_node *new_node, *node; unsigned long now; int i; new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); if (!new_node) return NULL; ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). */ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; new_node->time_out[i] = now; } for (i = 0; i < HSR_PT_PORTS; i++) new_node->seq_out[i] = seq_out; if (san && hsr->proto_ops->handle_san_frame) hsr->proto_ops->handle_san_frame(san, rx_port, new_node); spin_lock_bh(&hsr->list_lock); list_for_each_entry_rcu(node, node_db, mac_list, lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) goto out; } list_add_tail_rcu(&new_node->mac_list, node_db); spin_unlock_bh(&hsr->list_lock); return new_node; out: spin_unlock_bh(&hsr->list_lock); kfree(new_node); return node; } void prp_update_san_info(struct hsr_node *node, bool is_sup) { if (!is_sup) return; node->san_a = false; node->san_b = false; } /* Get the hsr_node from which 'skb' was sent. */ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port) { struct hsr_priv *hsr = port->hsr; struct hsr_node *node; struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; ethhdr = (struct ethhdr *)skb_mac_header(skb); list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Check if required node is not in proxy nodes table */ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); return node; } } /* Everyone may create a node entry, connected node to a HSR/PRP * device. */ if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return NULL; /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { seq_out = prp_get_skb_sequence_nr(rct); } else { if (rx_port != HSR_PT_MASTER) san = true; seq_out = HSR_SEQNR_START; } } return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, san, rx_port); } /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate * node. */ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; unsigned int pull_size = 0; unsigned int total_pull_size = 0; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol * header info. */ if (frame->skb_hsr) skb = frame->skb_hsr; else if (frame->skb_prp) skb = frame->skb_prp; else if (frame->skb_std) skb = frame->skb_std; if (!skb) return; /* Leave the ethernet header. */ pull_size = sizeof(struct ethhdr); skb_pull(skb, pull_size); total_pull_size += pull_size; ethhdr = (struct ethhdr *)skb_mac_header(skb); /* And leave the HSR tag. */ if (ethhdr->h_proto == htons(ETH_P_HSR)) { pull_size = sizeof(struct hsr_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; } /* And leave the HSR sup tag. */ pull_size = sizeof(struct hsr_sup_tag); skb_pull(skb, pull_size); total_pull_size += pull_size; /* get HSR sup payload */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ node_db = &port_rcv->hsr->node_db; node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1, true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) /* Node has already been merged */ goto done; /* Leave the first HSR sup payload. */ pull_size = sizeof(struct hsr_sup_payload); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get second supervision tlv */ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; /* And check if it is a redbox mac TLV */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* We could stop here after pushing hsr_sup_payload, * or proceed and allow macaddress_B and for redboxes. */ /* Sanity check length */ if (hsr_sup_tlv->HSR_TLV_length != 6) goto done; /* Leave the second HSR sup tlv. */ pull_size = sizeof(struct hsr_sup_tlv); skb_pull(skb, pull_size); total_pull_size += pull_size; /* Get redbox mac address. */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Check if redbox mac and node mac are equal. */ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { /* This is a redbox supervision frame for a VDAN! */ goto done; } } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); spin_lock_bh(&node_real->seq_out_lock); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { node_real->time_in[i] = node_curr->time_in[i]; node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; kfree_rcu(node_curr, rcu_head); } spin_unlock_bh(&hsr->list_lock); done: /* Push back here */ skb_push(skb, total_pull_size); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. * * If the frame was sent by a node's B interface, replace the source * address with that node's "official" address (macaddress_A) so that upper * layers recognize where it came from. */ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb) { if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__);